diff options
Diffstat (limited to 'usr/src/cmd')
48 files changed, 6262 insertions, 4984 deletions
diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c index 34bb772632..5a4779cfa5 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/agent.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -133,6 +133,7 @@ main(int argc, char **argv) boolean_t is_verbose; int ipc_fd; int c; + int aware = RTAW_UNDER_IPMP; struct rlimit rl; debug_level = df_get_int("", B_FALSE, DF_DEBUG_LEVEL); @@ -301,6 +302,17 @@ main(int argc, char **argv) dhcpmsg(MSG_ERR, "cannot open routing socket"); return (EXIT_FAILURE); } + + /* + * We're IPMP-aware and can manage IPMP test addresses, so issue + * RT_AWARE to get routing socket messages for interfaces under IPMP. + */ + if (setsockopt(rtsock_fd, SOL_ROUTE, RT_AWARE, &aware, + sizeof (aware)) == -1) { + dhcpmsg(MSG_ERR, "cannot set RT_AWARE on routing socket"); + return (EXIT_FAILURE); + } + if (iu_register_event(eh, rtsock_fd, POLLIN, rtsock_event, 0) == -1) { dhcpmsg(MSG_ERR, "cannot register routing socket for messages"); return (EXIT_FAILURE); @@ -1182,7 +1194,7 @@ check_lif(dhcp_lif_t *lif, const struct ifa_msghdr *ifam, int msglen) lif->lif_name); lif_mark_decline(lif, "duplicate address"); close_ip_lif(lif); - (void) open_ip_lif(lif, INADDR_ANY); + (void) open_ip_lif(lif, INADDR_ANY, B_TRUE); } dad_wait = lif->lif_dad_wait; diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c index 4637ecc346..6cfce9f0a9 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/bound.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
* * BOUND state of the DHCP client state machine. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/socket.h> #include <sys/types.h> #include <string.h> @@ -358,7 +356,8 @@ dhcp_bound_complete(dhcp_smach_t *dsmp) lif = dsmp->dsm_lif; if (router_list != NULL && (router_list->len % sizeof (ipaddr_t)) == 0 && - strchr(lif->lif_name, ':') == NULL) { + strchr(lif->lif_name, ':') == NULL && + !lif->lif_pif->pif_under_ipmp) { dsmp->dsm_nrouters = router_list->len / sizeof (ipaddr_t); dsmp->dsm_routers = malloc(router_list->len); diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c index 0cfdad40e3..5d2d5fb99e 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -76,6 +76,7 @@ insert_pif(const char *pname, boolean_t isv6, int *error) { dhcp_pif_t *pif; struct lifreq lifr; + lifgroupinfo_t lifgr; dlpi_handle_t dh = NULL; int fd = isv6 ? v6_sock_fd : v4_sock_fd; @@ -127,12 +128,60 @@ insert_pif(const char *pname, boolean_t isv6, int *error) } /* - * For IPv4, use DLPI to determine the hardware type, hardware - * address, and hardware address length. + * Check if the pif is in an IPMP group. Interfaces using IPMP don't + * have dedicated hardware addresses, and get their hardware type from + * the SIOCGLIFGROUPINFO ioctl rather than DLPI. 
*/ - if (!isv6) { - int rc; - dlpi_info_t dlinfo; + if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) == -1) { + *error = DHCP_IPC_E_INT; + dhcpmsg(MSG_ERR, "insert_pif: SIOCGLIFGROUPNAME for %s", pname); + goto failure; + } + + if (lifr.lifr_groupname[0] != '\0') { + (void) strlcpy(lifgr.gi_grname, lifr.lifr_groupname, + LIFGRNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPINFO, &lifgr) == -1) { + *error = DHCP_IPC_E_INT; + dhcpmsg(MSG_ERR, "insert_pif: SIOCGLIFGROUPINFO for %s", + lifgr.gi_grname); + goto failure; + } + + pif->pif_hwtype = dlpi_arptype(lifgr.gi_mactype); + pif->pif_under_ipmp = (strcmp(pname, lifgr.gi_grifname) != 0); + (void) strlcpy(pif->pif_grifname, lifgr.gi_grifname, LIFNAMSIZ); + + /* + * For IPMP underlying interfaces, stash the interface index + * of the IPMP meta-interface; we'll use it to send/receive + * traffic. This is both necessary (since IP_BOUND_IF for + * non-unicast traffic won't work on underlying interfaces) + * and preferred (since a test address lease will be able to + * be maintained as long as another interface in the group is + * still functioning). + */ + if (pif->pif_under_ipmp) { + (void) strlcpy(lifr.lifr_name, pif->pif_grifname, + LIFNAMSIZ); + + if (ioctl(fd, SIOCGLIFINDEX, &lifr) == -1) { + *error = DHCP_IPC_E_INT; + dhcpmsg(MSG_ERR, "insert_pif: SIOCGLIFINDEX " + "for %s", lifr.lifr_name); + goto failure; + } + pif->pif_grindex = lifr.lifr_index; + } + } + + /* + * For IPv4, if the hardware type is still unknown, use DLPI to + * determine it, the hardware address, and hardware address length. 
+ */ + if (!isv6 && pif->pif_hwtype == 0) { + int rc; + dlpi_info_t dlinfo; if ((rc = dlpi_open(pname, &dh, 0)) != DLPI_SUCCESS) { dhcpmsg(MSG_ERROR, "insert_pif: dlpi_open: %s", @@ -661,11 +710,12 @@ verify_lif(const dhcp_lif_t *lif) boolean_t isv6; int fd; struct lifreq lifr; + dhcp_pif_t *pif = lif->lif_pif; (void) memset(&lifr, 0, sizeof (struct lifreq)); (void) strlcpy(lifr.lifr_name, lif->lif_name, LIFNAMSIZ); - isv6 = lif->lif_pif->pif_isv6; + isv6 = pif->pif_isv6; fd = isv6 ? v6_sock_fd : v4_sock_fd; if (ioctl(fd, SIOCGLIFFLAGS, &lifr) == -1) { @@ -689,43 +739,41 @@ verify_lif(const dhcp_lif_t *lif) } /* - * Special case: if the interface has gone down as a duplicate, then - * this alone does _not_ mean that we're abandoning it just yet. Allow - * the state machine to handle this normally by trying to get a new - * lease. - */ - if ((lifr.lifr_flags & (IFF_UP|IFF_DUPLICATE)) == IFF_DUPLICATE) { - dhcpmsg(MSG_DEBUG, "verify_lif: duplicate address on %s", - lif->lif_name); - return (B_TRUE); - } - - /* - * If the user has torn down or started up the interface manually, then - * abandon the lease. - */ - if ((lif->lif_flags ^ lifr.lifr_flags) & IFF_UP) { - dhcpmsg(MSG_DEBUG, "verify_lif: user has %s %s", - lifr.lifr_flags & IFF_UP ? "started up" : "shut down", - lif->lif_name); - return (B_FALSE); - } - - /* * Check for delete and recreate. 
*/ if (ioctl(fd, SIOCGLIFINDEX, &lifr) == -1) { - dhcpmsg(MSG_ERR, "verify_lif: SIOCGLIFINDEX failed on %s", - lif->lif_name); + if (errno != ENXIO) { + dhcpmsg(MSG_ERR, "verify_lif: SIOCGLIFINDEX failed " + "on %s", lif->lif_name); + } return (B_FALSE); } - if (lifr.lifr_index != lif->lif_pif->pif_index) { + if (lifr.lifr_index != pif->pif_index) { dhcpmsg(MSG_DEBUG, "verify_lif: ifindex on %s changed: %u to %u", - lif->lif_name, lif->lif_pif->pif_index, lifr.lifr_index); + lif->lif_name, pif->pif_index, lifr.lifr_index); return (B_FALSE); } + if (pif->pif_under_ipmp) { + (void) strlcpy(lifr.lifr_name, pif->pif_grifname, LIFNAMSIZ); + + if (ioctl(fd, SIOCGLIFINDEX, &lifr) == -1) { + if (errno != ENXIO) { + dhcpmsg(MSG_ERR, "verify_lif: SIOCGLIFINDEX " + "failed on %s", lifr.lifr_name); + } + return (B_FALSE); + } + + if (lifr.lifr_index != pif->pif_grindex) { + dhcpmsg(MSG_DEBUG, "verify_lif: IPMP group ifindex " + "on %s changed: %u to %u", lifr.lifr_name, + pif->pif_grindex, lifr.lifr_index); + return (B_FALSE); + } + } + /* * If the IP address, netmask, or broadcast address have changed, or * the interface has been unplumbed, then we act like there has been an @@ -934,6 +982,13 @@ plumb_lif(dhcp_pif_t *pif, const in6_addr_t *addr) lifr.lifr_name); goto failure; } + + /* + * See comment in set_lif_dhcp(). + */ + if (pif->pif_under_ipmp && !(lifr.lifr_flags & IFF_NOFAILOVER)) + lifr.lifr_flags |= IFF_NOFAILOVER | IFF_DEPRECATED; + lifr.lifr_flags |= IFF_UP | IFF_DHCPRUNNING; if (ioctl(v6_sock_fd, SIOCSLIFFLAGS, &lifr) == -1) { dhcpmsg(MSG_ERR, "plumb_lif: SIOCSLIFFLAGS %s", @@ -1060,8 +1115,9 @@ set_lif_dhcp(dhcp_lif_t *lif, boolean_t is_adopting) int fd; int err; struct lifreq lifr; + dhcp_pif_t *pif = lif->lif_pif; - fd = lif->lif_pif->pif_isv6 ? v6_sock_fd : v4_sock_fd; + fd = pif->pif_isv6 ? 
v6_sock_fd : v4_sock_fd; (void) strlcpy(lifr.lifr_name, lif->lif_name, LIFNAMSIZ); @@ -1098,6 +1154,17 @@ set_lif_dhcp(dhcp_lif_t *lif, boolean_t is_adopting) "set on %s", lif->lif_name); } } else { + /* + * If the lif is on an interface under IPMP, IFF_NOFAILOVER + * must be set or the kernel will prevent us from setting + * IFF_DHCPRUNNING (since the subsequent IFF_UP would lead to + * migration). We set IFF_DEPRECATED too since the kernel + * will set it automatically when setting IFF_NOFAILOVER, + * causing our lif_flags value to grow stale. + */ + if (pif->pif_under_ipmp && !(lifr.lifr_flags & IFF_NOFAILOVER)) + lifr.lifr_flags |= IFF_NOFAILOVER | IFF_DEPRECATED; + lifr.lifr_flags |= IFF_DHCPRUNNING; if (ioctl(fd, SIOCSLIFFLAGS, &lifr) == -1) { dhcpmsg(MSG_ERR, "set_lif_dhcp: SIOCSLIFFLAGS for %s", @@ -1207,6 +1274,13 @@ clear_lif_deprecated(dhcp_lif_t *lif) return (B_FALSE); } + /* + * Don't try to clear IFF_DEPRECATED if this is a test address, + * since IPMP's use of IFF_DEPRECATED is not compatible with ours. + */ + if (lifr.lifr_flags & IFF_NOFAILOVER) + return (B_TRUE); + if (!(lifr.lifr_flags & IFF_DEPRECATED)) return (B_TRUE); @@ -1226,16 +1300,19 @@ clear_lif_deprecated(dhcp_lif_t *lif) * * input: dhcp_lif_t *: the logical interface to operate on * in_addr_t: the address the socket will be bound to (in hbo) + * boolean_t: B_TRUE if the address should be brought up (if needed) * output: boolean_t: B_TRUE if the socket was opened successfully. 
*/ boolean_t -open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo) +open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo, boolean_t bringup) { const char *errmsg; struct lifreq lifr; int on = 1; uchar_t ttl = 255; + uint32_t ifindex; + dhcp_pif_t *pif = lif->lif_pif; if (lif->lif_sock_ip_fd != -1) { dhcpmsg(MSG_WARNING, "open_ip_lif: socket already open on %s", @@ -1270,7 +1347,7 @@ open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo) } if (setsockopt(lif->lif_sock_ip_fd, IPPROTO_IP, IP_DHCPINIT_IF, - &lif->lif_pif->pif_index, sizeof (int)) == -1) { + &pif->pif_index, sizeof (int)) == -1) { errmsg = "cannot set IP_DHCPINIT_IF"; goto failure; } @@ -1288,23 +1365,40 @@ open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo) goto failure; } - if (setsockopt(lif->lif_sock_ip_fd, IPPROTO_IP, IP_BOUND_IF, - &lif->lif_pif->pif_index, sizeof (int)) == -1) { + ifindex = pif->pif_under_ipmp ? pif->pif_grindex : pif->pif_index; + if (setsockopt(lif->lif_sock_ip_fd, IPPROTO_IP, IP_BOUND_IF, &ifindex, + sizeof (int)) == -1) { errmsg = "cannot set IP_BOUND_IF"; goto failure; } - /* - * Make sure at least one lif on the interface we used in IP_BOUND_IF - * is IFF_UP so that we can send and receive IP packets. - */ (void) strlcpy(lifr.lifr_name, lif->lif_name, LIFNAMSIZ); if (ioctl(v4_sock_fd, SIOCGLIFFLAGS, &lifr) == -1) { errmsg = "cannot get interface flags"; goto failure; } - if (!(lifr.lifr_flags & IFF_UP)) { + /* + * If the lif is part of an interface under IPMP, IFF_NOFAILOVER must + * be set or the kernel will prevent us from setting IFF_DHCPRUNNING + * (since the subsequent IFF_UP would lead to migration). We set + * IFF_DEPRECATED too since the kernel will set it automatically when + * setting IFF_NOFAILOVER, causing our lif_flags value to grow stale. 
+ */ + if (pif->pif_under_ipmp && !(lifr.lifr_flags & IFF_NOFAILOVER)) { + lifr.lifr_flags |= IFF_NOFAILOVER | IFF_DEPRECATED; + if (ioctl(v4_sock_fd, SIOCSLIFFLAGS, &lifr) == -1) { + errmsg = "cannot set IFF_NOFAILOVER"; + goto failure; + } + } + lif->lif_flags = lifr.lifr_flags; + + /* + * If this is initial bringup, make sure the address we're acquiring a + * lease on is IFF_UP. + */ + if (bringup && !(lifr.lifr_flags & IFF_UP)) { /* * Start from a clean slate. */ @@ -1330,6 +1424,30 @@ open_ip_lif(dhcp_lif_t *lif, in_addr_t addr_hbo) ((struct sockaddr_in *)&lifr.lifr_addr)->sin_addr.s_addr; } + /* + * Usually, bringing up the address we're acquiring a lease on is + * sufficient to allow packets to be sent and received via the + * IP_BOUND_IF we did earlier. However, if we're acquiring a lease on + * an underlying IPMP interface, the group interface will be used for + * sending and receiving IP packets via IP_BOUND_IF. Thus, ensure at + * least one address on the group interface is IFF_UP. + */ + if (bringup && pif->pif_under_ipmp) { + (void) strlcpy(lifr.lifr_name, pif->pif_grifname, LIFNAMSIZ); + if (ioctl(v4_sock_fd, SIOCGLIFFLAGS, &lifr) == -1) { + errmsg = "cannot get IPMP group interface flags"; + goto failure; + } + + if (!(lifr.lifr_flags & IFF_UP)) { + lifr.lifr_flags |= IFF_UP; + if (ioctl(v4_sock_fd, SIOCSLIFFLAGS, &lifr) == -1) { + errmsg = "cannot bring up IPMP group interface"; + goto failure; + } + } + } + lif->lif_packet_id = iu_register_event(eh, lif->lif_sock_ip_fd, POLLIN, dhcp_packet_lif, lif); if (lif->lif_packet_id == -1) { diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h index a59e3ea68d..46cf30bedb 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/interface.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ #ifndef INTERFACE_H #define INTERFACE_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Interface.[ch] encapsulate all of the agent's knowledge of network * interfaces from the DHCP agent's perspective. See interface.c for @@ -66,6 +64,9 @@ struct dhcp_pif_s { boolean_t pif_running; /* interface is running */ uint_t pif_hold_count; /* reference count */ char pif_name[LIFNAMSIZ]; + char pif_grifname[LIFNAMSIZ]; + uint32_t pif_grindex; /* interface index for pif_grifname */ + boolean_t pif_under_ipmp; /* is an ipmp underlying interface */ }; struct dhcp_lif_s { @@ -182,7 +183,7 @@ dhcp_lif_t *attach_lif(const char *, boolean_t, int *); int set_lif_dhcp(dhcp_lif_t *, boolean_t); void set_lif_deprecated(dhcp_lif_t *); boolean_t clear_lif_deprecated(dhcp_lif_t *); -boolean_t open_ip_lif(dhcp_lif_t *, in_addr_t); +boolean_t open_ip_lif(dhcp_lif_t *, in_addr_t, boolean_t); void close_ip_lif(dhcp_lif_t *); void lif_mark_decline(dhcp_lif_t *, const char *); boolean_t schedule_lif_timer(dhcp_lif_t *, dhcp_timer_t *, diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c index 8a32b55ea5..a763530436 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/packet.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <string.h> #include <sys/types.h> #include <stdlib.h> @@ -970,7 +968,10 @@ send_pkt_internal(dhcp_smach_t *dsmp) ipi6->ipi6_addr = lif->lif_v6addr; else ipi6->ipi6_addr = my_in6addr_any; - ipi6->ipi6_ifindex = lif->lif_pif->pif_index; + if (lif->lif_pif->pif_under_ipmp) + ipi6->ipi6_ifindex = lif->lif_pif->pif_grindex; + else + ipi6->ipi6_ifindex = lif->lif_pif->pif_index; cmsg->cmsg_len = (char *)(ipi6 + 1) - (char *)cmsg; /* diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c index a8c05de986..78da07aebf 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/request.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * REQUESTING state of the client state machine. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdlib.h> #include <string.h> #include <search.h> @@ -1008,7 +1006,8 @@ dhcp_acknak_global(iu_eh_t *ehp, int fd, short events, iu_event_id_t id, for (dsmp = lookup_smach_by_xid(xid, NULL, isv6); dsmp != NULL; dsmp = lookup_smach_by_xid(xid, dsmp, isv6)) { pif = dsmp->dsm_lif->lif_pif; - if (pif->pif_index == plp->ifindex) + if (pif->pif_index == plp->ifindex || + pif->pif_under_ipmp && pif->pif_grindex == plp->ifindex) break; } diff --git a/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c b/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c index 9ae7fd7aba..852b428551 100644 --- a/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c +++ b/usr/src/cmd/cmd-inet/sbin/dhcpagent/states.c @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * This module contains core functions for managing DHCP state machine * instances. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdlib.h> #include <search.h> #include <string.h> @@ -151,7 +149,7 @@ insert_smach(dhcp_lif_t *lif, int *error) /* * With IPv4 DHCP, we use a socket per lif. */ - if (!open_ip_lif(lif, INADDR_ANY)) { + if (!open_ip_lif(lif, INADDR_ANY, B_TRUE)) { dhcpmsg(MSG_ERR, "unable to open socket for %s", lif->lif_name); /* This will also dispose of the LIF */ @@ -696,14 +694,15 @@ set_smach_state(dhcp_smach_t *dsmp, DHCPSTATE state) if (is_bound_state(dsmp->dsm_state)) { if (!is_bound_state(state)) { close_ip_lif(lif); - if (!open_ip_lif(lif, INADDR_ANY)) + if (!open_ip_lif(lif, INADDR_ANY, + B_FALSE)) return (B_FALSE); } } else { if (is_bound_state(state)) { close_ip_lif(lif); if (!open_ip_lif(lif, - ntohl(lif->lif_addr))) + ntohl(lif->lif_addr), B_FALSE)) return (B_FALSE); } } @@ -952,11 +951,14 @@ no_specified_id: * unable to parse it. We need to determine if a Client ID is required * and, if so, generate one. * - * If it's IPv4 and not a logical interface, then we need to preserve - * backward-compatibility by avoiding new-fangled DUID/IAID - * construction. + * If it's IPv4, not in an IPMP group, and not a logical interface, + * then we need to preserve backward-compatibility by avoiding + * new-fangled DUID/IAID construction. (Note: even for IPMP test + * addresses, we construct a DUID/IAID since we may renew a lease for + * an IPMP test address on any functioning IP interface in the group.) */ - if (!pif->pif_isv6 && strchr(dsmp->dsm_name, ':') == NULL) { + if (!pif->pif_isv6 && pif->pif_grifname[0] == '\0' && + strchr(dsmp->dsm_name, ':') == NULL) { if (pif->pif_hwtype == ARPHRD_IB) { /* * This comes from the DHCP over IPoIB specification. 
diff --git a/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c b/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c index 47e1202b32..d73722cc55 100644 --- a/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c +++ b/usr/src/cmd/cmd-inet/sbin/ifparse/ifparse.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -8,8 +8,6 @@ * specifies the terms and conditions for redistribution. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Ifparse splits up an ifconfig command line, and was written for use * with the networking boot scripts; see $SRC/cmd/svc/shell/net_include.sh @@ -184,6 +182,7 @@ struct cmd { { "auto-revarp", 0, AF_INET, PARSEFIXED}, { "plumb", 0, AF_ANY, PARSENOW }, { "unplumb", 0, AF_ANY, PARSENOW }, + { "ipmp", 0, AF_ANY, PARSELOG0 }, { "subnet", NEXTARG, AF_ANY, 0 }, { "token", NEXTARG, AF_INET6, PARSELOG0 }, { "tsrc", NEXTARG, AF_ANY, PARSELOG0 }, diff --git a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c index b9a02b54e7..2d115e221b 100644 --- a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c +++ b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -29,8 +29,6 @@ * MROUTING Revision 3.5 */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * simple netstat based on snmp/mib-2 interface to the TCP/IP stack * @@ -221,6 +219,7 @@ static char *plural(int n); static char *pluraly(int n); static char *plurales(int n); static void process_filter(char *arg); +static char *ifindex2str(uint_t, char *); static boolean_t family_selected(int family); static void usage(char *); @@ -680,8 +679,14 @@ mibget(int sd) tor->OPT_offset = sizeof (struct T_optmgmt_req); tor->OPT_length = sizeof (struct opthdr); tor->MGMT_flags = T_CURRENT; + + + /* + * Note: we use the special level value below so that IP will return + * us information concerning IRE_MARK_TESTHIDDEN routes. + */ req = (struct opthdr *)&tor[1]; - req->level = MIB2_IP; /* any MIB2_xxx value ok here */ + req->level = EXPER_IP_AND_TESTHIDDEN; req->name = 0; req->len = 0; @@ -712,7 +717,7 @@ mibget(int sd) stderr); i = 0; for (last_item = first_item; last_item; - last_item = last_item->next_item) + last_item = last_item->next_item) (void) printf("%d %4d %5d %d\n", ++i, last_item->group, @@ -1707,19 +1712,19 @@ mib_get_constants(mib_item_t *item) ipRouteAttributeSize = ip->ipRouteAttributeSize; transportMLPSize = ip->transportMLPSize; assert(IS_P2ALIGNED(ipAddrEntrySize, - sizeof (mib2_ipAddrEntry_t *)) && - IS_P2ALIGNED(ipRouteEntrySize, - sizeof (mib2_ipRouteEntry_t *)) && - IS_P2ALIGNED(ipNetToMediaEntrySize, - sizeof (mib2_ipNetToMediaEntry_t *)) && - IS_P2ALIGNED(ipMemberEntrySize, - sizeof (ip_member_t *)) && - IS_P2ALIGNED(ipGroupSourceEntrySize, - sizeof (ip_grpsrc_t *)) && - IS_P2ALIGNED(ipRouteAttributeSize, - sizeof (mib2_ipAttributeEntry_t *)) && - IS_P2ALIGNED(transportMLPSize, - sizeof (mib2_transportMLPEntry_t *))); + sizeof (mib2_ipAddrEntry_t *))); + assert(IS_P2ALIGNED(ipRouteEntrySize, + sizeof (mib2_ipRouteEntry_t *))); + assert(IS_P2ALIGNED(ipNetToMediaEntrySize, + sizeof (mib2_ipNetToMediaEntry_t *))); + assert(IS_P2ALIGNED(ipMemberEntrySize, + sizeof 
(ip_member_t *))); + assert(IS_P2ALIGNED(ipGroupSourceEntrySize, + sizeof (ip_grpsrc_t *))); + assert(IS_P2ALIGNED(ipRouteAttributeSize, + sizeof (mib2_ipAttributeEntry_t *))); + assert(IS_P2ALIGNED(transportMLPSize, + sizeof (mib2_transportMLPEntry_t *))); break; } case EXPER_DVMRP: { @@ -1728,8 +1733,9 @@ mib_get_constants(mib_item_t *item) vifctlSize = mrts->mrts_vifctlSize; mfcctlSize = mrts->mrts_mfcctlSize; assert(IS_P2ALIGNED(vifctlSize, - sizeof (struct vifclt *)) && - IS_P2ALIGNED(mfcctlSize, sizeof (struct mfcctl *))); + sizeof (struct vifclt *))); + assert(IS_P2ALIGNED(mfcctlSize, + sizeof (struct mfcctl *))); break; } case MIB2_IP6: { @@ -1745,17 +1751,17 @@ mib_get_constants(mib_item_t *item) ipv6GroupSourceEntrySize = ip6->ipv6GroupSourceEntrySize; assert(IS_P2ALIGNED(ipv6IfStatsEntrySize, - sizeof (mib2_ipv6IfStatsEntry_t *)) && - IS_P2ALIGNED(ipv6AddrEntrySize, - sizeof (mib2_ipv6AddrEntry_t *)) && - IS_P2ALIGNED(ipv6RouteEntrySize, - sizeof (mib2_ipv6RouteEntry_t *)) && - IS_P2ALIGNED(ipv6NetToMediaEntrySize, - sizeof (mib2_ipv6NetToMediaEntry_t *)) && - IS_P2ALIGNED(ipv6MemberEntrySize, - sizeof (ipv6_member_t *)) && - IS_P2ALIGNED(ipv6GroupSourceEntrySize, - sizeof (ipv6_grpsrc_t *))); + sizeof (mib2_ipv6IfStatsEntry_t *))); + assert(IS_P2ALIGNED(ipv6AddrEntrySize, + sizeof (mib2_ipv6AddrEntry_t *))); + assert(IS_P2ALIGNED(ipv6RouteEntrySize, + sizeof (mib2_ipv6RouteEntry_t *))); + assert(IS_P2ALIGNED(ipv6NetToMediaEntrySize, + sizeof (mib2_ipv6NetToMediaEntry_t *))); + assert(IS_P2ALIGNED(ipv6MemberEntrySize, + sizeof (ipv6_member_t *))); + assert(IS_P2ALIGNED(ipv6GroupSourceEntrySize, + sizeof (ipv6_grpsrc_t *))); break; } case MIB2_ICMP6: { @@ -1774,9 +1780,9 @@ mib_get_constants(mib_item_t *item) tcpConnEntrySize = tcp->tcpConnTableSize; tcp6ConnEntrySize = tcp->tcp6ConnTableSize; assert(IS_P2ALIGNED(tcpConnEntrySize, - sizeof (mib2_tcpConnEntry_t *)) && - IS_P2ALIGNED(tcp6ConnEntrySize, - sizeof (mib2_tcp6ConnEntry_t *))); + sizeof 
(mib2_tcpConnEntry_t *))); + assert(IS_P2ALIGNED(tcp6ConnEntrySize, + sizeof (mib2_tcp6ConnEntry_t *))); break; } case MIB2_UDP: { @@ -1785,9 +1791,9 @@ mib_get_constants(mib_item_t *item) udpEntrySize = udp->udpEntrySize; udp6EntrySize = udp->udp6EntrySize; assert(IS_P2ALIGNED(udpEntrySize, - sizeof (mib2_udpEntry_t *)) && - IS_P2ALIGNED(udp6EntrySize, - sizeof (mib2_udp6Entry_t *))); + sizeof (mib2_udpEntry_t *))); + assert(IS_P2ALIGNED(udp6EntrySize, + sizeof (mib2_udp6Entry_t *))); break; } case MIB2_SCTP: { @@ -1843,7 +1849,6 @@ stat_report(mib_item_t *item) { int jtemp = 0; char ifname[LIFNAMSIZ + 1]; - char *ifnamep; /* 'for' loop 1: */ for (; item; item = item->next_item) { @@ -1891,12 +1896,10 @@ stat_report(mib_item_t *item) bzero(&sum6, sizeof (sum6)); /* 'for' loop 2a: */ for (ip6 = (mib2_ipv6IfStatsEntry_t *)item->valp; - (char *)ip6 < (char *)item->valp - + item->length; + (char *)ip6 < (char *)item->valp + item->length; /* LINTED: (note 1) */ ip6 = (mib2_ipv6IfStatsEntry_t *)((char *)ip6 + ipv6IfStatsEntrySize)) { - if (ip6->ipv6IfIndex == 0) { /* * The "unknown interface" ip6 @@ -1905,19 +1908,10 @@ stat_report(mib_item_t *item) sum_ip6_stats(ip6, &sum6); continue; /* 'for' loop 2a */ } - ifnamep = if_indextoname( - ip6->ipv6IfIndex, - ifname); - if (ifnamep == NULL) { - (void) printf( - "Invalid ifindex %d\n", - ip6->ipv6IfIndex); - continue; /* 'for' loop 2a */ - } - if (Aflag) { (void) printf("\nIPv6 for %s\n", - ifnamep); + ifindex2str(ip6->ipv6IfIndex, + ifname)); print_ip6_stats(ip6); } sum_ip6_stats(ip6, &sum6); @@ -1935,15 +1929,10 @@ stat_report(mib_item_t *item) break; bzero(&sum6, sizeof (sum6)); /* 'for' loop 2b: */ - for (icmp6 = - (mib2_ipv6IfIcmpEntry_t *)item->valp; - (char *)icmp6 < (char *)item->valp - + item->length; - icmp6 = - /* LINTED: (note 1) */ - (mib2_ipv6IfIcmpEntry_t *)((char *)icmp6 - + ipv6IfIcmpEntrySize)) { - + for (icmp6 = (mib2_ipv6IfIcmpEntry_t *)item->valp; + (char *)icmp6 < (char *)item->valp + item->length; + 
icmp6 = (void *)((char *)icmp6 + + ipv6IfIcmpEntrySize)) { if (icmp6->ipv6IfIcmpIfIndex == 0) { /* * The "unknown interface" icmp6 @@ -1952,19 +1941,10 @@ stat_report(mib_item_t *item) sum_icmp6_stats(icmp6, &sum6); continue; /* 'for' loop 2b: */ } - ifnamep = if_indextoname( - icmp6->ipv6IfIcmpIfIndex, ifname); - if (ifnamep == NULL) { - (void) printf( - "Invalid ifindex %d\n", - icmp6->ipv6IfIcmpIfIndex); - continue; /* 'for' loop 2b: */ - } - if (Aflag) { - (void) printf( - "\nICMPv6 for %s\n", - ifnamep); + (void) printf("\nICMPv6 for %s\n", + ifindex2str( + icmp6->ipv6IfIcmpIfIndex, ifname)); print_icmp6_stats(icmp6); } sum_icmp6_stats(icmp6, &sum6); @@ -2369,51 +2349,49 @@ print_mrt_stats(struct mrtstat *mrts) { (void) puts("DVMRP multicast routing:"); (void) printf(" %10u hit%s - kernel forwarding cache hits\n", - mrts->mrts_mfc_hits, PLURAL(mrts->mrts_mfc_hits)); + mrts->mrts_mfc_hits, PLURAL(mrts->mrts_mfc_hits)); (void) printf(" %10u miss%s - kernel forwarding cache misses\n", - mrts->mrts_mfc_misses, PLURALES(mrts->mrts_mfc_misses)); + mrts->mrts_mfc_misses, PLURALES(mrts->mrts_mfc_misses)); (void) printf(" %10u packet%s potentially forwarded\n", - mrts->mrts_fwd_in, PLURAL(mrts->mrts_fwd_in)); + mrts->mrts_fwd_in, PLURAL(mrts->mrts_fwd_in)); (void) printf(" %10u packet%s actually sent out\n", - mrts->mrts_fwd_out, PLURAL(mrts->mrts_fwd_out)); + mrts->mrts_fwd_out, PLURAL(mrts->mrts_fwd_out)); (void) printf(" %10u upcall%s - upcalls made to mrouted\n", - mrts->mrts_upcalls, PLURAL(mrts->mrts_upcalls)); + mrts->mrts_upcalls, PLURAL(mrts->mrts_upcalls)); (void) printf(" %10u packet%s not sent out due to lack of resources\n", - mrts->mrts_fwd_drop, PLURAL(mrts->mrts_fwd_drop)); + mrts->mrts_fwd_drop, PLURAL(mrts->mrts_fwd_drop)); (void) printf(" %10u datagram%s with malformed tunnel options\n", - mrts->mrts_bad_tunnel, PLURAL(mrts->mrts_bad_tunnel)); + mrts->mrts_bad_tunnel, PLURAL(mrts->mrts_bad_tunnel)); (void) printf(" %10u datagram%s with no room for 
tunnel options\n", - mrts->mrts_cant_tunnel, PLURAL(mrts->mrts_cant_tunnel)); + mrts->mrts_cant_tunnel, PLURAL(mrts->mrts_cant_tunnel)); (void) printf(" %10u datagram%s arrived on wrong interface\n", - mrts->mrts_wrong_if, PLURAL(mrts->mrts_wrong_if)); + mrts->mrts_wrong_if, PLURAL(mrts->mrts_wrong_if)); (void) printf(" %10u datagram%s dropped due to upcall Q overflow\n", - mrts->mrts_upq_ovflw, PLURAL(mrts->mrts_upq_ovflw)); + mrts->mrts_upq_ovflw, PLURAL(mrts->mrts_upq_ovflw)); (void) printf(" %10u datagram%s cleaned up by the cache\n", - mrts->mrts_cache_cleanups, PLURAL(mrts->mrts_cache_cleanups)); + mrts->mrts_cache_cleanups, PLURAL(mrts->mrts_cache_cleanups)); (void) printf(" %10u datagram%s dropped selectively by ratelimiter\n", - mrts->mrts_drop_sel, PLURAL(mrts->mrts_drop_sel)); + mrts->mrts_drop_sel, PLURAL(mrts->mrts_drop_sel)); (void) printf(" %10u datagram%s dropped - bucket Q overflow\n", - mrts->mrts_q_overflow, PLURAL(mrts->mrts_q_overflow)); + mrts->mrts_q_overflow, PLURAL(mrts->mrts_q_overflow)); (void) printf(" %10u datagram%s dropped - larger than bkt size\n", - mrts->mrts_pkt2large, PLURAL(mrts->mrts_pkt2large)); + mrts->mrts_pkt2large, PLURAL(mrts->mrts_pkt2large)); (void) printf("\nPIM multicast routing:\n"); (void) printf(" %10u datagram%s dropped - bad version number\n", - mrts->mrts_pim_badversion, PLURAL(mrts->mrts_pim_badversion)); + mrts->mrts_pim_badversion, PLURAL(mrts->mrts_pim_badversion)); (void) printf(" %10u datagram%s dropped - bad checksum\n", - mrts->mrts_pim_rcv_badcsum, PLURAL(mrts->mrts_pim_rcv_badcsum)); + mrts->mrts_pim_rcv_badcsum, PLURAL(mrts->mrts_pim_rcv_badcsum)); (void) printf(" %10u datagram%s dropped - bad register packets\n", - mrts->mrts_pim_badregisters, - PLURAL(mrts->mrts_pim_badregisters)); + mrts->mrts_pim_badregisters, PLURAL(mrts->mrts_pim_badregisters)); (void) printf( - " %10u datagram%s potentially forwarded - register packets\n", - mrts->mrts_pim_regforwards, PLURAL(mrts->mrts_pim_regforwards)); + " 
%10u datagram%s potentially forwarded - register packets\n", + mrts->mrts_pim_regforwards, PLURAL(mrts->mrts_pim_regforwards)); (void) printf(" %10u datagram%s dropped - register send drops\n", - mrts->mrts_pim_regsend_drops, - PLURAL(mrts->mrts_pim_regsend_drops)); + mrts->mrts_pim_regsend_drops, PLURAL(mrts->mrts_pim_regsend_drops)); (void) printf(" %10u datagram%s dropped - packet malformed\n", - mrts->mrts_pim_malformed, PLURAL(mrts->mrts_pim_malformed)); + mrts->mrts_pim_malformed, PLURAL(mrts->mrts_pim_malformed)); (void) printf(" %10u datagram%s dropped - no memory to forward\n", - mrts->mrts_pim_nomemory, PLURAL(mrts->mrts_pim_nomemory)); + mrts->mrts_pim_nomemory, PLURAL(mrts->mrts_pim_nomemory)); } static void @@ -2674,7 +2652,7 @@ if_report(mib_item_t *item, char *matchname, "Ierrs", "Opkts", "Oerrs", "Collis", "Queue"); - first = B_FALSE; + first = B_FALSE; } if_report_ip4(ap, ifname, logintname, &stat, B_TRUE); @@ -2717,7 +2695,7 @@ if_report(mib_item_t *item, char *matchname, + item->length; ap++) { (void) octetstr(&ap->ipAdEntIfIndex, - 'a', ifname, sizeof (ifname)); + 'a', ifname, sizeof (ifname)); (void) strtok(ifname, ":"); if (matchname) { @@ -3387,7 +3365,7 @@ dhcp_walk_interfaces(uint_t flags_on, uint_t flags_off, int af, */ (void) memset(&lifn, 0, sizeof (lifn)); lifn.lifn_family = af; - lifn.lifn_flags = LIFC_ALLZONES | LIFC_NOXMIT; + lifn.lifn_flags = LIFC_ALLZONES | LIFC_NOXMIT | LIFC_UNDER_IPMP; if (ioctl(sock_fd, SIOCGLIFNUM, &lifn) == -1) n_ifs = LIFN_GUARD_VALUE; else @@ -3471,7 +3449,6 @@ group_report(mib_item_t *item) ip_grpsrc_t *ips; ipv6_member_t *ipmp6; ipv6_grpsrc_t *ips6; - char *ifnamep; boolean_t first, first_src; /* 'for' loop 1: */ @@ -3604,7 +3581,7 @@ group_report(mib_item_t *item) (char *)ipmp6 < (char *)v6grp->valp + v6grp->length; /* LINTED: (note 1) */ ipmp6 = (ipv6_member_t *)((char *)ipmp6 + - ipv6MemberEntrySize)) { + ipv6MemberEntrySize)) { if (first) { (void) puts("Group Memberships: " "IPv6"); @@ -3615,15 +3592,8 
@@ group_report(mib_item_t *item) first = B_FALSE; } - ifnamep = if_indextoname( - ipmp6->ipv6GroupMemberIfIndex, ifname); - if (ifnamep == NULL) { - (void) printf("Invalid ifindex %d\n", - ipmp6->ipv6GroupMemberIfIndex); - continue; - } (void) printf("%-5s %-27s %5u\n", - ifnamep, + ifindex2str(ipmp6->ipv6GroupMemberIfIndex, ifname), pr_addr6(&ipmp6->ipv6GroupMemberAddress, abuf, sizeof (abuf)), ipmp6->ipv6GroupMemberRefCnt); @@ -3784,7 +3754,6 @@ ndp_report(mib_item_t *item) char xbuf[STR_EXPAND * OCTET_LENGTH + 1]; mib2_ipv6NetToMediaEntry_t *np6; char ifname[LIFNAMSIZ + 1]; - char *ifnamep; boolean_t first; if (!(family_selected(AF_INET6))) @@ -3820,13 +3789,6 @@ ndp_report(mib_item_t *item) first = B_FALSE; } - ifnamep = if_indextoname(np6->ipv6NetToMediaIfIndex, - ifname); - if (ifnamep == NULL) { - (void) printf("Invalid ifindex %d\n", - np6->ipv6NetToMediaIfIndex); - continue; /* 'for' loop 2 */ - } switch (np6->ipv6NetToMediaState) { case ND_INCOMPLETE: state = "INCOMPLETE"; @@ -3865,7 +3827,7 @@ ndp_report(mib_item_t *item) break; } (void) printf("%-5s %-17s %-7s %-12s %-27s\n", - ifnamep, + ifindex2str(np6->ipv6NetToMediaIfIndex, ifname), octetstr(&np6->ipv6NetToMediaPhysAddress, 'h', xbuf, sizeof (xbuf)), type, @@ -4472,7 +4434,7 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, (void) printf("%-27s %-27s %-5s %5u%c %5u %3u " "%-5s %6u %6u %s\n", pr_prefix6(&rp6->ipv6RouteDest, - rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), + rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), IN6_IS_ADDR_UNSPECIFIED(&rp6->ipv6RouteNextHop) ? " --" : pr_addr6(&rp6->ipv6RouteNextHop, gwbuf, sizeof (gwbuf)), @@ -4489,7 +4451,7 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, } else { (void) printf("%-27s %-27s %-5s %3u %7u %-5s %s\n", pr_prefix6(&rp6->ipv6RouteDest, - rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), + rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), IN6_IS_ADDR_UNSPECIFIED(&rp6->ipv6RouteNextHop) ? 
" --" : pr_addr6(&rp6->ipv6RouteNextHop, gwbuf, sizeof (gwbuf)), @@ -4690,9 +4652,9 @@ tcp_report_item_v4(const mib2_tcpConnEntry_t *tp, boolean_t first, (void) printf("%-20s\n%-20s %5u %08x %08x %5u %08x %08x " "%5u %5u %s\n", pr_ap(tp->tcpConnLocalAddress, - tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)), + tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)), pr_ap(tp->tcpConnRemAddress, - tp->tcpConnRemPort, "tcp", fname, sizeof (fname)), + tp->tcpConnRemPort, "tcp", fname, sizeof (fname)), tp->tcpConnEntryInfo.ce_swnd, tp->tcpConnEntryInfo.ce_snxt, tp->tcpConnEntryInfo.ce_suna, @@ -4710,9 +4672,9 @@ tcp_report_item_v4(const mib2_tcpConnEntry_t *tp, boolean_t first, (void) printf("%-20s %-20s %5u %6d %5u %6d %s\n", pr_ap(tp->tcpConnLocalAddress, - tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)), + tp->tcpConnLocalPort, "tcp", lname, sizeof (lname)), pr_ap(tp->tcpConnRemAddress, - tp->tcpConnRemPort, "tcp", fname, sizeof (fname)), + tp->tcpConnRemPort, "tcp", fname, sizeof (fname)), tp->tcpConnEntryInfo.ce_swnd, (sq >= 0) ? 
sq : 0, tp->tcpConnEntryInfo.ce_rwnd, @@ -4756,9 +4718,9 @@ tcp_report_item_v6(const mib2_tcp6ConnEntry_t *tp6, boolean_t first, (void) printf("%-33s\n%-33s %5u %08x %08x %5u %08x %08x " "%5u %5u %-11s %s\n", pr_ap6(&tp6->tcp6ConnLocalAddress, - tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)), + tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)), pr_ap6(&tp6->tcp6ConnRemAddress, - tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)), + tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)), tp6->tcp6ConnEntryInfo.ce_swnd, tp6->tcp6ConnEntryInfo.ce_snxt, tp6->tcp6ConnEntryInfo.ce_suna, @@ -4777,9 +4739,9 @@ tcp_report_item_v6(const mib2_tcp6ConnEntry_t *tp6, boolean_t first, (void) printf("%-33s %-33s %5u %6d %5u %6d %-11s %s\n", pr_ap6(&tp6->tcp6ConnLocalAddress, - tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)), + tp6->tcp6ConnLocalPort, "tcp", lname, sizeof (lname)), pr_ap6(&tp6->tcp6ConnRemAddress, - tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)), + tp6->tcp6ConnRemPort, "tcp", fname, sizeof (fname)), tp6->tcp6ConnEntryInfo.ce_swnd, (sq >= 0) ? sq : 0, tp6->tcp6ConnEntryInfo.ce_rwnd, @@ -5112,7 +5074,7 @@ sctp_pr_addr(int type, char *name, int namelen, const in6_addr_t *addr, * displaying. 
*/ switch (type) { - case MIB2_SCTP_ADDR_V4: + case MIB2_SCTP_ADDR_V4: /* v4 */ v6addr = *addr; @@ -5124,7 +5086,7 @@ sctp_pr_addr(int type, char *name, int namelen, const in6_addr_t *addr, } break; - case MIB2_SCTP_ADDR_V6: + case MIB2_SCTP_ADDR_V6: /* v6 */ if (port > 0) { (void) pr_ap6(addr, port, "sctp", name, namelen); @@ -5133,7 +5095,7 @@ sctp_pr_addr(int type, char *name, int namelen, const in6_addr_t *addr, } break; - default: + default: (void) snprintf(name, namelen, "<unknown addr type>"); break; } @@ -5379,7 +5341,7 @@ mrt_report(mib_item_t *item) case EXPER_DVMRP_MRT: if (Dflag) (void) printf("%u records for ipMfcTable:\n", - item->length/sizeof (struct vifctl)); + item->length/sizeof (struct vifctl)); if (item->length/sizeof (struct vifctl) == 0) { (void) puts("\nMulticast Forwarding Cache is " "empty"); @@ -5402,10 +5364,10 @@ mrt_report(mib_item_t *item) abuf, sizeof (abuf))); (void) printf("%-15.15s %6s %3u ", pr_net(mfccp->mfcc_mcastgrp.s_addr, - mfccp->mfcc_mcastgrp.s_addr, - abuf, sizeof (abuf)), + mfccp->mfcc_mcastgrp.s_addr, + abuf, sizeof (abuf)), pktscale((int)mfccp->mfcc_pkt_cnt), - mfccp->mfcc_parent); + mfccp->mfcc_parent); for (vifi = 0; vifi < MAXVIFS; ++vifi) { if (mfccp->mfcc_ttls[vifi]) { @@ -5468,7 +5430,7 @@ kmem_cache_stats(char *title, char *name, int prefix, int64_t *total_bytes) strncmp(ksp->ks_name, "streams_dblk", 12) == 0) { (void) safe_kstat_read(kc, ksp, NULL); total_buf_inuse -= - kstat_named_value(ksp, "buf_constructed"); + kstat_named_value(ksp, "buf_constructed"); continue; /* 'for' loop 1 */ } @@ -5501,7 +5463,7 @@ kmem_cache_stats(char *title, char *name, int prefix, int64_t *total_bytes) if (buf_size == 0) { (void) printf("%-22s [couldn't find statistics for %s]\n", - title, name); + title, name); return; } @@ -5511,7 +5473,7 @@ kmem_cache_stats(char *title, char *name, int prefix, int64_t *total_bytes) (void) snprintf(buf, sizeof (buf), "%s", title); (void) printf("%-22s %6d %9d %11lld %11d\n", buf, - 
total_buf_inuse, total_buf_max, total_alloc, total_alloc_fail); + total_buf_inuse, total_buf_max, total_alloc, total_alloc_fail); } static void @@ -5534,7 +5496,7 @@ m_report(void) kmem_cache_stats("qband", "qband_cache", 0, &total_bytes); (void) printf("\n%lld Kbytes allocated for streams data\n", - total_bytes / 1024); + total_bytes / 1024); (void) putchar('\n'); (void) fflush(stdout); @@ -5967,7 +5929,7 @@ portname(uint_t port, char *proto, char *dst, uint_t dstlen) sp = getservbyport(htons(port), proto); if (sp || port == 0) (void) snprintf(dst, dstlen, "%.*s", MAXHOSTNAMELEN, - sp ? sp->s_name : "*"); + sp ? sp->s_name : "*"); else (void) snprintf(dst, dstlen, "%d", port); dst[dstlen - 1] = 0; @@ -6161,8 +6123,8 @@ process_filter(char *arg) */ if (hp->h_addr_list[0] != NULL && /* LINTED: (note 1) */ - IN6_IS_ADDR_V4MAPPED((in6_addr_t - *)hp->h_addr_list[0])) { + IN6_IS_ADDR_V4MAPPED((in6_addr_t *) + hp->h_addr_list[0])) { maxv = IP_ABITS; } else { maxv = IPV6_ABITS; @@ -6226,6 +6188,21 @@ family_selected(int family) } /* + * Convert the interface index to a string using the buffer `ifname', which + * must be at least LIFNAMSIZ bytes. We first try to map it to name. If that + * fails (e.g., because we're inside a zone and it does not have access to + * interface for the index in question), just return "if#<num>". + */ +static char * +ifindex2str(uint_t ifindex, char *ifname) +{ + if (if_indextoname(ifindex, ifname) == NULL) + (void) snprintf(ifname, LIFNAMSIZ, "if#%d", ifindex); + + return (ifname); +} + +/* * print the usage line */ static void diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile index f0c4c03250..f3ce9fae4b 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/Makefile @@ -19,51 +19,58 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
# Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# - -PROG = in.mpathd -OBJS = mpd_tables.o mpd_main.o mpd_probe.o -SRCS = $(OBJS:%.o=%.c) -DEFAULTFILES = mpathd.dfl +PROG = in.mpathd +ROOTFS_PROG = $(PROG) +OBJS = mpd_tables.o mpd_main.o mpd_probe.o +SRCS = $(OBJS:%.o=%.c) +DEFAULTFILES = mpathd.dfl include ../../../Makefile.cmd -POFILE = $(PROG).po -POFILES = $(SRCS:%.c=%.po) +ROOTCMDDIR = $(ROOT)/lib/inet + +POFILE = $(PROG).po +POFILES = $(SRCS:%.c=%.po) -C99MODE= $(C99_ENABLE) +C99MODE = $(C99_ENABLE) # # We need access to the ancillary data features which are only available # via the SUS standards. Further, C99 support requires SUSv3 or higher. # CPPFLAGS += -D_XOPEN_SOURCE=600 -D__EXTENSIONS__ -LDLIBS += -lsocket -lnsl -lsysevent -lnvpair -lipmp -lc +LDLIBS += -lsocket -lnsl -lsysevent -lnvpair -lipmp -linetutil -ldlpi +LINTFLAGS += -erroff=E_INCONS_ARG_DECL2 -erroff=E_INCONS_ARG_USED2 -LINTFLAGS += -erroff=E_FUNC_DECL_VAR_ARG2 -erroff=E_INCONS_VAL_TYPE_DECL2 \ - -erroff=E_FUNC_USED_VAR_ARG2 -erroff=E_INCONS_ARG_DECL2 \ - -erroff=E_NAME_USED_NOT_DEF2 -erroff=E_INCONS_ARG_USED2 \ - -errtags=yes +# +# Instrument in.mpathd with CTF data to ease debugging. 
+# +CTFCONVERT_HOOK = && $(CTFCONVERT_O) +CTFMERGE_HOOK = && $(CTFMERGE) -L VERSION -o $@ $(OBJS) +$(OBJS) := CFLAGS += $(CTF_FLAGS) .KEEP_STATE: all: $(PROG) $(PROG): $(OBJS) - $(LINK.c) -o $@ $(OBJS) $(LDLIBS) + $(LINK.c) -o $@ $(OBJS) $(LDLIBS) $(CTFMERGE_HOOK) $(POST_PROCESS) include ../Makefile.lib +$(ROOTLIBINETPROG): + $(RM) $@; $(SYMLINK) ../../../lib/inet/$(PROG) $@ + $(ROOTSBINPROG): - $(RM) $@; $(SYMLINK) ../usr/lib/inet/$(PROG) $@ + $(RM) $@; $(SYMLINK) ../lib/inet/$(PROG) $@ -install: all $(ROOTLIBINETPROG) $(ROOTSBINPROG) $(ROOTETCDEFAULTFILES) +install: all $(ROOTLIBINETPROG) $(ROOTSBINPROG) $(ROOTCMD) \ + $(ROOTETCDEFAULTFILES) clean: $(RM) $(OBJS) diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h index 9b07e2a7a3..e7cb096bf7 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_defs.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _MPD_DEFS_H #define _MPD_DEFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -78,12 +76,13 @@ extern "C" { #include <locale.h> #include <deflt.h> +#include <libdlpi.h> +#include <libinetutil.h> #include <libnvpair.h> #include <libsysevent.h> #include <sys/sysevent.h> #include <sys/sysevent/eventdefs.h> #include <sys/sysevent/ipmp.h> -#include <zone.h> #include <ipmp_mpathd.h> #include <ipmp_query_impl.h> @@ -92,7 +91,7 @@ extern "C" { /* Debug flags */ #define D_ALL 0xffff /* enable all debug */ #define D_PROBE 0x0001 /* probe mechanism */ -#define D_FAILOVER 0x0002 /* failover mechanism */ +#define D_FAILREP 0x0002 /* failure/repair mechanism */ #define D_PHYINT 0x0004 /* phyint table */ #define D_LOGINT 0x0008 /* logint table */ #define D_TARGET 0x0010 /* target table */ @@ -199,10 +198,8 @@ extern int user_failure_detection_time; /* User specified fdt */ extern int ifsock_v4; /* IPv4 socket for ioctls */ extern int ifsock_v6; /* IPv6 socket for ioctls */ -extern boolean_t full_scan_required; /* Do full scans */ - extern int debug; /* debug option */ - +extern boolean_t cleanup_started; /* true if we're shutting down */ extern boolean_t handle_link_notifications; /* @@ -212,6 +209,7 @@ extern void timer_schedule(uint_t delay); extern void logmsg(int pri, const char *fmt, ...); extern void logperror(const char *str); extern int poll_add(int fd); +extern int poll_remove(int fd); extern uint64_t getcurrentsec(void); extern uint_t getcurrenttime(void); diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c index aa6a99fb9c..e1e22e12d4 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mpd_defs.h" #include "mpd_tables.h" @@ -46,7 +44,6 @@ static int lsock_v6; /* Listen socket to detect mpathd */ static int mibfd = -1; /* fd to get mib info */ static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ -boolean_t full_scan_required = _B_FALSE; static uint_t last_initifs_time; /* Time when initifs was last run */ static char **argv0; /* Saved for re-exec on SIGHUP */ boolean_t handle_link_notifications = _B_TRUE; @@ -58,10 +55,6 @@ static void check_if_removed(struct phyint_instance *pii); static void select_test_ifs(void); static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); -static void router_add_v4(mib2_ipRouteEntry_t *rp1, - struct in_addr nexthop_v4); -static void router_add_v6(mib2_ipv6RouteEntry_t *rp1, - struct in6_addr nexthop_v6); static void router_add_common(int af, char *ifname, struct in6_addr nexthop); static void init_router_targets(); @@ -74,17 +67,17 @@ static void check_addr_unique(struct phyint_instance *, static void init_host_targets(void); static void dup_host_targets(struct phyint_instance *desired_pii); static void loopback_cmd(int sock, int family); -static int poll_remove(int fd); static boolean_t daemonize(void); static int closefunc(void *, int); static unsigned int process_cmd(int newfd, union mi_commands *mpi); static unsigned int process_query(int fd, mi_query_t *miq); +static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop); static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); static unsigned int send_result(int fd, unsigned int error, int syserror); -struct local_addr *laddr_list = NULL; +addrlist_t *localaddrs; /* * Return the current time in milliseconds (from an arbitrary reference) @@ -153,7 +146,7 @@ retry: 
/* * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. */ -static int +int poll_remove(int fd) { int i; @@ -205,17 +198,11 @@ pii_process(int af, char *name, struct phyint_instance **pii_p) break; case PI_GROUP_CHANGED: - /* - * The phyint has changed group. - */ - restore_phyint(pii->pii_phyint); - /* FALLTHRU */ - case PI_IFINDEX_CHANGED: /* - * Interface index has changed. Delete and - * recreate the phyint as it is quite likely - * the interface has been unplumbed and replumbed. + * Interface index or group membership has changed. + * Delete the old state and recreate based on the new + * state (it may no longer be in a group). */ pii_other = phyint_inst_other(pii); if (pii_other != NULL) @@ -249,51 +236,26 @@ pii_process(int af, char *name, struct phyint_instance **pii_p) } /* - * This phyint is leaving the group. Try to restore the phyint to its - * initial state. Return the addresses that belong to other group members, - * to the group, and take back any addresses owned by this phyint - */ -void -restore_phyint(struct phyint *pi) -{ - if (pi->pi_group == phyint_anongroup) - return; - - /* - * Move everthing to some other member in the group. - * The phyint has changed group in the kernel. But we - * have yet to do it in our tables. 
- */ - if (!pi->pi_empty) - (void) try_failover(pi, FAILOVER_TO_ANY); - /* - * Move all addresses owned by 'pi' back to pi, from each - * of the other members of the group - */ - (void) try_failback(pi); -} - -/* * Scan all interfaces to detect changes as well as new and deleted interfaces */ static void initifs() { - int n; + int i, nlifr; int af; char *cp; char *buf; - int numifs; + int sockfd; + uint64_t flags; struct lifnum lifn; struct lifconf lifc; + struct lifreq lifreq; struct lifreq *lifr; struct logint *li; struct phyint_instance *pii; struct phyint_instance *next_pii; - char pi_name[LIFNAMSIZ + 1]; - boolean_t exists; - struct phyint *pi; - struct local_addr *next; + struct phyint_group *pg, *next_pg; + char pi_name[LIFNAMSIZ + 1]; if (debug & D_PHYINT) logdebug("initifs: Scanning interfaces\n"); @@ -301,13 +263,9 @@ initifs() last_initifs_time = getcurrenttime(); /* - * Free the laddr_list before collecting the local addresses. + * Free the existing local address list; we'll build a new list below. */ - while (laddr_list != NULL) { - next = laddr_list->next; - free(laddr_list); - laddr_list = next; - } + addrlist_free(&localaddrs); /* * Mark the interfaces so that we can find phyints and logints @@ -326,122 +284,142 @@ initifs() } } + /* + * As above, mark groups so that we can detect IPMP interfaces which + * have been removed from the kernel. Also, delete the group address + * list since we'll iteratively recreate it below. 
+ */ + for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { + pg->pg_in_use = _B_FALSE; + addrlist_free(&pg->pg_addrs); + } + lifn.lifn_family = AF_UNSPEC; - lifn.lifn_flags = LIFC_ALLZONES; + lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; +again: if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { - logperror("initifs: ioctl (get interface numbers)"); + logperror("initifs: ioctl (get interface count)"); return; } - numifs = lifn.lifn_count; + /* + * Pad the interface count to detect when additional interfaces have + * been configured between SIOCGLIFNUM and SIOCGLIFCONF. + */ + lifn.lifn_count += 4; - buf = (char *)calloc(numifs, sizeof (struct lifreq)); - if (buf == NULL) { + if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) { logperror("initifs: calloc"); return; } lifc.lifc_family = AF_UNSPEC; - lifc.lifc_flags = LIFC_ALLZONES; - lifc.lifc_len = numifs * sizeof (struct lifreq); + lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; + lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq); lifc.lifc_buf = buf; if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { - /* - * EINVAL is commonly encountered, when things change - * underneath us rapidly, (eg. at boot, when new interfaces - * are plumbed successively) and the kernel finds the buffer - * size we passed as too small. We will retry again - * when we see the next routing socket msg, or at worst after - * IF_SCAN_INTERVAL ms. - */ - if (errno != EINVAL) { - logperror("initifs: ioctl" - " (get interface configuration)"); - } + logperror("initifs: ioctl (get interface configuration)"); free(buf); return; } - lifr = (struct lifreq *)lifc.lifc_req; - /* - * For each lifreq returned by SIOGGLIFCONF, call pii_process() - * and get the state of the corresponding phyint_instance. If it is - * successful, then call logint_init_from_k() to get the state of the - * logint. 
+ * If every lifr_req slot is taken, then additional interfaces must + * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF. + * Recalculate to make sure we didn't miss any interfaces. */ - for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) { - int sockfd; - struct local_addr *taddr; - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; - struct lifreq lifreq; + nlifr = lifc.lifc_len / sizeof (struct lifreq); + if (nlifr >= lifn.lifn_count) { + free(buf); + goto again; + } + /* + * Walk through the lifreqs returned by SIOGGLIFCONF, and refresh the + * global list of addresses, phyint groups, phyints, and logints. + */ + for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) { af = lifr->lifr_addr.ss_family; - - /* - * Collect all local addresses. - */ sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6; - (void) memset(&lifreq, 0, sizeof (lifreq)); - (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, - sizeof (lifreq.lifr_name)); + (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ); if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) { if (errno != ENXIO) logperror("initifs: ioctl (SIOCGLIFFLAGS)"); continue; } + flags = lifreq.lifr_flags; + + /* + * If the address is IFF_UP, add it to the local address list. + * (We ignore addresses that aren't IFF_UP since another node + * might legitimately have that address IFF_UP.) + */ + if (flags & IFF_UP) { + (void) addrlist_add(&localaddrs, lifr->lifr_name, flags, + &lifr->lifr_addr); + } /* - * Add the interface address to laddr_list. - * Another node might have the same IP address which is up. - * In that case, it is appropriate to use the address as a - * target, even though it is also configured (but not up) on - * the local system. - * Hence,the interface address is not added to laddr_list - * unless it is IFF_UP. 
+ * If this address is on an IPMP meta-interface, update our + * phyint_group information (either by recording that group + * still exists or creating a new group), and track what + * group the address is part of. */ - if (lifreq.lifr_flags & IFF_UP) { - taddr = malloc(sizeof (struct local_addr)); - if (taddr == NULL) { - logperror("initifs: malloc"); + if (flags & IFF_IPMP) { + if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) { + if (errno != ENXIO) + logperror("initifs: ioctl " + "(SIOCGLIFGROUPNAME)"); continue; } - if (af == AF_INET) { - sin = (struct sockaddr_in *)&lifr->lifr_addr; - IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, - &taddr->addr); - } else { - sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr; - taddr->addr = sin6->sin6_addr; + + pg = phyint_group_lookup(lifreq.lifr_groupname); + if (pg == NULL) { + pg = phyint_group_create(lifreq.lifr_groupname); + if (pg == NULL) { + logerr("initifs: cannot create group " + "%s\n", lifreq.lifr_groupname); + continue; + } + phyint_group_insert(pg); + } + pg->pg_in_use = _B_TRUE; + + /* + * Add this to the group's list of data addresses. + */ + if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags, + &lifr->lifr_addr)) { + logerr("initifs: insufficient memory to track " + "data address information for %s\n", + lifr->lifr_name); } - taddr->next = laddr_list; - laddr_list = taddr; + continue; } /* - * Need to pass a phyint name to pii_process. Insert the - * null where the ':' IF_SEPARATOR is found in the logical - * name. + * This isn't an address on an IPMP meta-interface, so it's + * either on an underlying interface or not related to any + * group. Update our phyint and logint information (via + * pii_process() and logint_init_from_k()) -- but first, + * convert the logint name to a phyint name so we can call + * pii_process(). 
*/ (void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name)); if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) *cp = '\0'; - exists = pii_process(af, pi_name, &pii); - if (exists) { + if (pii_process(af, pi_name, &pii)) { /* The phyint is fine. So process the logint */ logint_init_from_k(pii, lifr->lifr_name); check_addr_unique(pii, &lifr->lifr_addr); } - } - free(buf); /* - * Scan for phyints and logints that have disappeared from the + * Scan for groups, phyints and logints that have disappeared from the * kernel, and delete them. */ for (pii = phyint_instances; pii != NULL; pii = next_pii) { @@ -449,70 +427,31 @@ initifs() check_if_removed(pii); } + for (pg = phyint_groups; pg != NULL; pg = next_pg) { + next_pg = pg->pg_next; + if (!pg->pg_in_use) { + phyint_group_delete(pg); + continue; + } + /* + * Refresh the group's state. This is necessary since the + * group's state is defined by the set of usable interfaces in + * the group, and an interface is considered unusable if all + * of its addresses are down. When an address goes down/up, + * the RTM_DELADDR/RTM_NEWADDR brings us through here. + */ + phyint_group_refresh_state(pg); + } + /* * Select a test address for sending probes on each phyint instance */ select_test_ifs(); /* - * Handle link up/down notifications from the NICs. + * Handle link up/down notifications. */ process_link_state_changes(); - - for (pi = phyints; pi != NULL; pi = pi->pi_next) { - /* - * If this is a case of group failure, we don't have much - * to do until the group recovers again. - */ - if (GROUP_FAILED(pi->pi_group)) - continue; - - /* - * Try/Retry any pending failovers / failbacks, that did not - * not complete, or that could not be initiated previously. 
- * This implements the 3 invariants described in the big block - * comment at the beginning of probe.c - */ - if (pi->pi_flags & IFF_INACTIVE) { - if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) - (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); - } else { - struct phyint_instance *pii; - - /* - * Skip LINK UP interfaces which are not capable - * of probing. - */ - pii = pi->pi_v4; - if (pii == NULL || - (LINK_UP(pi) && !PROBE_CAPABLE(pii))) { - pii = pi->pi_v6; - if (pii == NULL || - (LINK_UP(pi) && !PROBE_CAPABLE(pii))) - continue; - } - - /* - * It is possible that the phyint has started - * receiving packets, after it has been marked - * PI_FAILED. Don't initiate failover, if the - * phyint has started recovering. failure_state() - * captures this check. A similar logic is used - * for failback/repair case. - */ - if (pi->pi_state == PI_FAILED && !pi->pi_empty && - (failure_state(pii) == PHYINT_FAILURE)) { - (void) try_failover(pi, FAILOVER_NORMAL); - } else if (pi->pi_state == PI_RUNNING && !pi->pi_full) { - if (try_failback(pi) != IPMP_FAILURE) { - (void) change_lif_flags(pi, IFF_FAILED, - _B_FALSE); - /* Per state diagram */ - pi->pi_empty = 0; - } - } - } - } } /* @@ -569,7 +508,7 @@ check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss) * The probe socket is closed on each interface instance, and the * interface state set to PI_OFFLINE. */ -static void +void stop_probing(struct phyint *pi) { struct phyint_instance *pii; @@ -631,7 +570,6 @@ select_test_ifs(void) struct logint *li; struct logint *probe_logint; boolean_t target_scan_reqd = _B_FALSE; - struct target *tg; int rating; if (debug & D_PHYINT) @@ -645,8 +583,8 @@ select_test_ifs(void) probe_logint = NULL; /* - * An interface that is offline, should not be probed. - * Offline interfaces should always in PI_OFFLINE state, + * An interface that is offline should not be probed. 
+ * IFF_OFFLINE interfaces should always be PI_OFFLINE * unless some other entity has set the offline flag. */ if (pii->pii_phyint->pi_flags & IFF_OFFLINE) { @@ -659,6 +597,15 @@ select_test_ifs(void) stop_probing(pii->pii_phyint); } continue; + } else { + /* + * If something cleared IFF_OFFLINE (e.g., by accident + * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is + * inherently racy), the phyint may still be offline. + * Just ignore it. + */ + if (pii->pii_phyint->pi_state == PI_OFFLINE) + continue; } li = pii->pii_probe_logint; @@ -776,17 +723,6 @@ select_test_ifs(void) phyint_chstate(pii->pii_phyint, PI_NOTARGETS); } - if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) { - tg = pii->pii_targets; - if (tg != NULL) - target_delete(tg); - assert(pii->pii_targets == NULL); - assert(pii->pii_target_next == NULL); - assert(pii->pii_ntargets == 0); - target_create(pii, probe_logint->li_dstaddr, - _B_TRUE); - } - /* * If no targets are currently known for this phyint * we need to call init_router_targets. Since @@ -806,15 +742,16 @@ select_test_ifs(void) } /* - * Check the interface list for any interfaces that are marked - * PI_FAILED but no longer enabled to send probes, and call - * phyint_check_for_repair() to see if the link now indicates that the - * interface should be repaired. Also see the state diagram in + * Scan the interface list for any interfaces that are PI_FAILED or + * PI_NOTARGETS but no longer enabled to send probes, and call + * phyint_check_for_repair() to see if the link state indicates that + * the interface should be repaired. Also see the state diagram in * mpd_probe.c. 
*/ for (pi = phyints; pi != NULL; pi = pi->pi_next) { - if (pi->pi_state == PI_FAILED && - !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { + if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) && + (pi->pi_state == PI_FAILED || + pi->pi_state == PI_NOTARGETS)) { phyint_check_for_repair(pi); } } @@ -875,15 +812,14 @@ check_testconfig(void) pi->pi_v6->pii_probe_logint->li_dupaddr) li = pi->pi_v6->pii_probe_logint; - if (li != NULL) { - if (!pi->pi_duptaddrmsg_printed) { - (void) pr_addr(li->li_phyint_inst->pii_af, - li->li_addr, abuf, sizeof (abuf)); - logerr("Test address %s is not unique in " - "group; disabling probe-based failure " - "detection on %s\n", abuf, pi->pi_name); - pi->pi_duptaddrmsg_printed = 1; - } + if (li != NULL && li->li_dupaddr) { + if (pi->pi_duptaddrmsg_printed) + continue; + logerr("Test address %s is not unique in group; " + "disabling probe-based failure detection on %s\n", + pr_addr(li->li_phyint_inst->pii_af, + li->li_addr, abuf, sizeof (abuf)), pi->pi_name); + pi->pi_duptaddrmsg_printed = 1; continue; } @@ -915,10 +851,10 @@ check_config(void) boolean_t v6_in_group; /* - * All phyints of a group must be homogenous to ensure that - * failover or failback can be done. If any phyint in a group - * has IPv4 plumbed, check that all phyints have IPv4 plumbed. - * Do a similar check for IPv6. + * All phyints of a group must be homogeneous to ensure that they can + * take over for one another. If any phyint in a group has IPv4 + * plumbed, check that all phyints have IPv4 plumbed. Do a similar + * check for IPv6. 
*/ for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { if (pg == phyint_anongroup) @@ -949,9 +885,9 @@ check_config(void) if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) { if (!pi->pi_cfgmsg_printed) { - logerr("NIC %s of group %s is" - " not plumbed for IPv4 and may" - " affect failover capability\n", + logerr("IP interface %s in group %s is" + " not plumbed for IPv4, affecting" + " IPv4 connectivity\n", pi->pi_name, pi->pi_group->pg_name); pi->pi_cfgmsg_printed = 1; @@ -959,9 +895,9 @@ check_config(void) } else if (v6_in_group == _B_TRUE && pi->pi_v6 == NULL) { if (!pi->pi_cfgmsg_printed) { - logerr("NIC %s of group %s is" - " not plumbed for IPv6 and may" - " affect failover capability\n", + logerr("IP interface %s in group %s is" + " not plumbed for IPv6, affecting" + " IPv6 connectivity\n", pi->pi_name, pi->pi_group->pg_name); pi->pi_cfgmsg_printed = 1; @@ -974,10 +910,10 @@ check_config(void) * error recovery message */ if (pi->pi_cfgmsg_printed) { - logerr("NIC %s is now consistent with " - "group %s and failover capability " - "is restored\n", pi->pi_name, - pi->pi_group->pg_name); + logerr("IP interface %s is now" + " consistent with group %s " + " and connectivity is restored\n", + pi->pi_name, pi->pi_group->pg_name); pi->pi_cfgmsg_printed = 0; } } @@ -1117,8 +1053,8 @@ run_timeouts(void) static int eventpipe_read = -1; /* Used for synchronous signal delivery */ static int eventpipe_write = -1; -static boolean_t cleanup_started = _B_FALSE; - /* Don't write to eventpipe if in cleanup */ +boolean_t cleanup_started = _B_FALSE; /* true if we're going away */ + /* * Ensure that signals are processed synchronously with the rest of * the code by just writing a one character signal number on the pipe. 
@@ -1228,7 +1164,7 @@ in_signal(int fd) "Number of probes sent %lld\n" "Number of probe acks received %lld\n" "Number of probes/acks lost %lld\n" - "Number of valid unacknowled probes %lld\n" + "Number of valid unacknowledged probes %lld\n" "Number of ambiguous probe acks received %lld\n", AF_STR(pii->pii_af), pii->pii_name, sent, acked, lost, unacked, unknown); @@ -1321,12 +1257,20 @@ setup_rtsock(int af) { int s; int flags; + int aware = RTAW_UNDER_IPMP; s = socket(PF_ROUTE, SOCK_RAW, af); if (s == -1) { logperror("setup_rtsock: socket PF_ROUTE"); exit(1); } + + if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) { + logperror("setup_rtsock: setsockopt RT_AWARE"); + (void) close(s); + exit(1); + } + if ((flags = fcntl(s, F_GETFL, 0)) < 0) { logperror("setup_rtsock: fcntl F_GETFL"); (void) close(s); @@ -1347,8 +1291,7 @@ setup_rtsock(int af) /* * Process an RTM_IFINFO message received on a routing socket. * The return value indicates whether a full interface scan is required. - * Link up/down notifications from the NICs are reflected in the - * IFF_RUNNING flag. + * Link up/down notifications are reflected in the IFF_RUNNING flag. * If just the state of the IFF_RUNNING interface flag has changed, a * a full interface scan isn't required. */ @@ -1400,7 +1343,7 @@ process_rtm_ifinfo(if_msghdr_t *ifm, int type) /* * We want to try and avoid doing a full interface scan for - * link state notifications from the NICs, as indicated + * link state notifications from the datalink layer, as indicated * by the state of the IFF_RUNNING flag. If just the * IFF_RUNNING flag has changed state, the link state changes * are processed without a full scan. @@ -1441,25 +1384,7 @@ process_rtm_ifinfo(if_msghdr_t *ifm, int type) * types. */ if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) - phyint_newtype(pi); - - /* - * If IFF_INACTIVE has been set, then no data addresses should be - * hosted on the interface. 
If IFF_INACTIVE has been cleared, then - * move previously failed-over addresses back to it, provided it is - * not failed. For details, see the state diagram in mpd_probe.c. - */ - if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) { - if (pii->pii_flags & IFF_INACTIVE) { - if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) - (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); - } else { - if (pi->pi_state == PI_RUNNING && !pi->pi_full) { - pi->pi_empty = 0; - (void) try_failback(pi); - } - } - } + phyint_changed(pi); /* Has just the IFF_RUNNING flag changed state ? */ if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) { @@ -1620,22 +1545,24 @@ update_router_list(int fd) t_scalar_t prim; tor = (struct T_optmgmt_req *)&buf; - tor->PRIM_type = T_SVR4_OPTMGMT_REQ; tor->OPT_offset = sizeof (struct T_optmgmt_req); tor->OPT_length = sizeof (struct opthdr); tor->MGMT_flags = T_CURRENT; + /* + * Note: we use the special level value below so that IP will return + * us information concerning IRE_MARK_TESTHIDDEN routes. 
+ */ req = (struct opthdr *)&tor[1]; - req->level = MIB2_IP; /* any MIB2_xxx value ok here */ + req->level = EXPER_IP_AND_TESTHIDDEN; req->name = 0; req->len = 0; ctlbuf.buf = (char *)&buf; ctlbuf.len = tor->OPT_length + tor->OPT_offset; ctlbuf.maxlen = sizeof (buf); - flags = 0; - if (putmsg(fd, &ctlbuf, NULL, flags) == -1) { + if (putmsg(fd, &ctlbuf, NULL, 0) == -1) { logperror("update_router_list: putmsg(ctl)"); return (_B_FALSE); } @@ -1689,7 +1616,8 @@ update_router_list(int fd) case T_OPTMGMT_ACK: toa = &buf.uprim.optmgmt_ack; optp = (struct opthdr *)&toa[1]; - if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) { + if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) + + sizeof (struct opthdr))) { logerr("update_router_list: ctlbuf.len %d\n", ctlbuf.len); return (_B_FALSE); @@ -1707,7 +1635,7 @@ update_router_list(int fd) return (_B_FALSE); } - /* Process the T_OPGMGMT_ACK below */ + /* Process the T_OPTMGMT_ACK below */ assert(prim == T_OPTMGMT_ACK); switch (status) { @@ -1717,9 +1645,8 @@ update_router_list(int fd) * message. If this is the last message i.e EOD, * return, else process the next T_OPTMGMT_ACK msg. */ - if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) + - sizeof (struct opthdr)) && optp->len == 0 && - optp->name == 0 && optp->level == 0) { + if (optp->len == 0 && optp->name == 0 && + optp->level == 0) { /* * This is the EOD message. 
Return */ @@ -1747,17 +1674,14 @@ update_router_list(int fd) databuf.len = 0; flags = 0; for (;;) { - status = getmsg(fd, NULL, &databuf, &flags); - if (status >= 0) { + if (getmsg(fd, NULL, &databuf, &flags) >= 0) break; - } else if (errno == EINTR) { + if (errno == EINTR) continue; - } else { - logperror("update_router_list:" - " getmsg(data)"); - free(databuf.buf); - return (_B_FALSE); - } + + logperror("update_router_list: getmsg(data)"); + free(databuf.buf); + return (_B_FALSE); } if (optp->level == MIB2_IP && @@ -1777,18 +1701,35 @@ update_router_list(int fd) /* NOTREACHED */ } + +/* + * Convert octet `octp' to a phyint name and store in `ifname' + */ +static void +oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize) +{ + char *cp; + size_t len = MIN(octp->o_length, ifsize - 1); + + (void) strncpy(ifname, octp->o_bytes, len); + ifname[len] = '\0'; + + if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL) + *cp = '\0'; +} + /* - * Examine the IPv4 routing table, for default routers. For each default - * router, populate the list of targets of each phyint that is on the same - * link as the default router + * Examine the IPv4 routing table `buf' for possible targets. For each + * possible target, if it's on the same subnet an interface route, pass + * it to router_add_common() for further consideration. */ static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) { - mib2_ipRouteEntry_t *rp; - mib2_ipRouteEntry_t *rp1; - struct in_addr nexthop_v4; - mib2_ipRouteEntry_t *endp; + char ifname[LIFNAMSIZ]; + mib2_ipRouteEntry_t *rp, *rp1, *endp; + struct in_addr nexthop_v4; + struct in6_addr nexthop; if (len == 0) return; @@ -1797,75 +1738,40 @@ ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) endp = buf + (len / sizeof (mib2_ipRouteEntry_t)); /* - * Loop thru the routing table entries. Process any IRE_DEFAULT, - * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. - * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 
- * This is a potential target for probing, which we try to add - * to the list of probe targets. + * Scan the routing table entries for any IRE_OFFSUBNET entries, and + * cross-reference them with the interface routes to determine if + * they're possible probe targets. */ for (rp = buf; rp < endp; rp++) { if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) continue; - /* Get the nexthop address. */ + /* Get the nexthop address. */ nexthop_v4.s_addr = rp->ipRouteNextHop; /* - * Get the nexthop address. Then determine the outgoing - * interface, by examining all interface IREs, and picking the - * match. We don't look at the interface specified in the route - * because we need to add the router target on all matching - * interfaces anyway; the goal is to avoid falling back to - * multicast when some interfaces are in the same subnet but - * not in the same group. + * Rescan the routing table looking for interface routes that + * are on the same subnet, and try to add them. If they're + * not relevant (e.g., the interface route isn't part of an + * IPMP group, router_add_common() will discard). */ for (rp1 = buf; rp1 < endp; rp1++) { - if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) { + if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) || + rp1->ipRouteIfIndex.o_length == 0) continue; - } - /* - * Determine the interface IRE that matches the nexthop. - * i.e. 
(IRE addr & IRE mask) == (nexthop & IRE mask) - */ - if ((rp1->ipRouteDest & rp1->ipRouteMask) == - (nexthop_v4.s_addr & rp1->ipRouteMask)) { - /* - * We found the interface ire - */ - router_add_v4(rp1, nexthop_v4); - } + if ((rp1->ipRouteDest & rp1->ipRouteMask) != + (nexthop_v4.s_addr & rp1->ipRouteMask)) + continue; + + oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ); + IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); + router_add_common(AF_INET, ifname, nexthop); } } } void -router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4) -{ - char *cp; - char ifname[LIFNAMSIZ + 1]; - struct in6_addr nexthop; - int len; - - if (debug & D_TARGET) - logdebug("router_add_v4()\n"); - - len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1); - (void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len); - ifname[len] = '\0'; - - if (ifname[0] == '\0') - return; - - cp = strchr(ifname, IF_SEPARATOR); - if (cp != NULL) - *cp = '\0'; - - IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); - router_add_common(AF_INET, ifname, nexthop); -} - -void router_add_common(int af, char *ifname, struct in6_addr nexthop) { struct phyint_instance *pii; @@ -1906,16 +1812,17 @@ router_add_common(int af, char *ifname, struct in6_addr nexthop) } /* - * Examine the IPv6 routing table, for default routers. For each default - * router, populate the list of targets of each phyint that is on the same - * link as the default router + * Examine the IPv6 routing table `buf' for possible link-local targets, and + * pass any contenders to router_add_common() for further consideration. 
*/ static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) { - mib2_ipv6RouteEntry_t *rp; - mib2_ipv6RouteEntry_t *endp; - struct in6_addr nexthop_v6; + struct lifreq lifr; + char ifname[LIFNAMSIZ]; + char grname[LIFGRNAMSIZ]; + mib2_ipv6RouteEntry_t *rp, *rp1, *endp; + struct in6_addr nexthop_v6; if (debug & D_TARGET) logdebug("ire_process_v6(len %d)\n", len); @@ -1927,62 +1834,51 @@ ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t)); /* - * Loop thru the routing table entries. Process any IRE_DEFAULT, - * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. - * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. - * This is a potential target for probing, which we try to add - * to the list of probe targets. + * Scan the routing table entries for any IRE_OFFSUBNET entries, and + * cross-reference them with the interface routes to determine if + * they're possible probe targets. */ for (rp = buf; rp < endp; rp++) { - if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET)) + if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) || + !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop)) continue; - /* - * We have the outgoing interface in ipv6RouteIfIndex - * if ipv6RouteIfindex.o_length is non-zero. The outgoing - * interface must be present for link-local addresses. Since - * we use only link-local addreses for probing, we don't - * consider the case when the outgoing interface is not - * known and we need to scan interface ires - */ + /* Get the nexthop address. */ nexthop_v6 = rp->ipv6RouteNextHop; - if (rp->ipv6RouteIfIndex.o_length != 0) { - /* - * We already have the outgoing interface - * in ipv6RouteIfIndex. 
- */ - router_add_v6(rp, nexthop_v6); - } - } -} - -void -router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6) -{ - char ifname[LIFNAMSIZ + 1]; - char *cp; - int len; - - if (debug & D_TARGET) - logdebug("router_add_v6()\n"); - - len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1); - (void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len); - ifname[len] = '\0'; + /* + * The interface name should always exist for link-locals; + * we use it to map this entry to an IPMP group name. + */ + if (rp->ipv6RouteIfIndex.o_length == 0) + continue; - if (ifname[0] == '\0') - return; + oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ); + if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 || + strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) { + continue; + } - cp = strchr(ifname, IF_SEPARATOR); - if (cp != NULL) - *cp = '\0'; + /* + * Rescan the list of routes for interface routes, and add the + * above target to any interfaces in the same IPMP group. + */ + for (rp1 = buf; rp1 < endp; rp1++) { + if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) || + rp1->ipv6RouteIfIndex.o_length == 0) { + continue; + } + oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ); + (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); - router_add_common(AF_INET6, ifname, nexthop_v6); + if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 && + strcmp(lifr.lifr_groupname, grname) == 0) { + router_add_common(AF_INET6, ifname, nexthop_v6); + } + } + } } - - /* * Build a list of target routers, by scanning the routing tables. * It is assumed that interface routes exist, to reach the routers. @@ -2001,11 +1897,9 @@ init_router_targets(void) for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { pi = pii->pii_phyint; /* - * Exclude ptp and host targets. Set tg_in_use to false, - * only for router targets. + * Set tg_in_use to false only for router targets. 
*/ - if (!pii->pii_targets_are_routers || - (pi->pi_flags & IFF_POINTOPOINT)) + if (!pii->pii_targets_are_routers) continue; for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) @@ -2026,15 +1920,21 @@ init_router_targets(void) } for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { - if (!pii->pii_targets_are_routers || - (pi->pi_flags & IFF_POINTOPOINT)) + pi = pii->pii_phyint; + if (!pii->pii_targets_are_routers) continue; for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { next_tg = tg->tg_next; - if (!tg->tg_in_use) { + /* + * If the group has failed, it's likely the route was + * removed by an application affected by that failure. + * In that case, we keep the target so that we can + * reliably repair, at which point we'll refresh the + * target list again. + */ + if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group)) target_delete(tg); - } } } } @@ -2140,7 +2040,7 @@ getdefault(char *name) * Command line options below */ boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ -boolean_t track_all_phyints = _B_FALSE; /* option to track all NICs */ +boolean_t track_all_phyints = _B_FALSE; /* track all IP interfaces */ static boolean_t adopt = _B_FALSE; static boolean_t foreground = _B_FALSE; @@ -2149,6 +2049,7 @@ main(int argc, char *argv[]) { int i; int c; + struct phyint *pi; struct phyint_instance *pii; char *value; @@ -2173,14 +2074,15 @@ main(int argc, char *argv[]) if (user_failure_detection_time <= 0) { user_failure_detection_time = FAILURE_DETECTION_TIME; logerr("Invalid failure detection time %s, assuming " - "default %d\n", value, user_failure_detection_time); + "default of %d ms\n", value, + user_failure_detection_time); } else if (user_failure_detection_time < MIN_FAILURE_DETECTION_TIME) { user_failure_detection_time = MIN_FAILURE_DETECTION_TIME; logerr("Too small failure detection time of %s, " - "assuming minimum %d\n", value, + "assuming minimum of %d ms\n", value, user_failure_detection_time); } free(value); @@ 
-2211,9 +2113,9 @@ main(int argc, char *argv[]) */ value = getdefault("FAILBACK"); if (value != NULL) { - if (strncasecmp(value, "yes", 3) == 0) + if (strcasecmp(value, "yes") == 0) failback_enabled = _B_TRUE; - else if (strncasecmp(value, "no", 2) == 0) + else if (strcasecmp(value, "no") == 0) failback_enabled = _B_FALSE; else logerr("Invalid value for FAILBACK %s\n", value); @@ -2229,9 +2131,9 @@ main(int argc, char *argv[]) */ value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); if (value != NULL) { - if (strncasecmp(value, "yes", 3) == 0) + if (strcasecmp(value, "yes") == 0) track_all_phyints = _B_FALSE; - else if (strncasecmp(value, "no", 2) == 0) + else if (strcasecmp(value, "no") == 0) track_all_phyints = _B_TRUE; else logerr("Invalid value for " @@ -2340,12 +2242,6 @@ main(int argc, char *argv[]) initifs(); - /* Inform kernel whether failback is enabled or disabled */ - if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) { - logperror("main: ioctl (SIOCSIPMPFAILBACK)"); - exit(1); - } - /* * If we're operating in "adopt" mode and no interfaces need to be * tracked, shut down (ifconfig(1M) will restart us on demand if @@ -2379,6 +2275,7 @@ main(int argc, char *argv[]) process_rtsock(rtsock_v4, rtsock_v6); break; } + for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { if (pollfds[i].fd == pii->pii_probe_sock) { @@ -2389,15 +2286,21 @@ main(int argc, char *argv[]) break; } } + + for (pi = phyints; pi != NULL; pi = pi->pi_next) { + if (pi->pi_notes != 0 && + pollfds[i].fd == dlpi_fd(pi->pi_dh)) { + (void) dlpi_recv(pi->pi_dh, NULL, NULL, + NULL, NULL, 0, NULL); + break; + } + } + if (pollfds[i].fd == lsock_v4) loopback_cmd(lsock_v4, AF_INET); else if (pollfds[i].fd == lsock_v6) loopback_cmd(lsock_v6, AF_INET6); } - if (full_scan_required) { - initifs(); - full_scan_required = _B_FALSE; - } } /* NOTREACHED */ return (EXIT_SUCCESS); @@ -2481,29 +2384,23 @@ static struct { { "MI_PING", sizeof (uint32_t) }, { "MI_OFFLINE", sizeof 
(mi_offline_t) }, { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, - { "MI_SETOINDEX", sizeof (mi_setoindex_t) }, { "MI_QUERY", sizeof (mi_query_t) } }; /* - * Commands received over the loopback interface come here. Currently - * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP - * module. ifconfig only makes a connection, and closes it to check if - * in.mpathd is running. - * if_mpadm sends commands in the format specified by the mpathd_interface - * structure. + * Commands received over the loopback interface come here (via libipmp). */ static void loopback_cmd(int sock, int family) { int newfd; ssize_t len; + boolean_t is_priv = _B_FALSE; struct sockaddr_storage peer; struct sockaddr_in *peer_sin; struct sockaddr_in6 *peer_sin6; socklen_t peerlen; union mi_commands mpi; - struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; char abuf[INET6_ADDRSTRLEN]; uint_t cmd; int retval; @@ -2528,10 +2425,11 @@ loopback_cmd(int sock, int family) return; } peer_sin = (struct sockaddr_in *)&peer; - if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) || - (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) { - (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, - abuf, sizeof (abuf)); + is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED; + (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, + abuf, sizeof (abuf)); + + if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) { logerr("Attempt to connect from addr %s port %d\n", abuf, ntohs(peer_sin->sin_port)); (void) close(newfd); @@ -2551,11 +2449,10 @@ loopback_cmd(int sock, int family) * talking to us. 
*/ peer_sin6 = (struct sockaddr_in6 *)&peer; - if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) || - (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr, - &loopback_addr))) { - (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, - sizeof (abuf)); + is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED; + (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, + sizeof (abuf)); + if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) { logerr("Attempt to connect from addr %s port %d\n", abuf, ntohs(peer_sin6->sin6_port)); (void) close(newfd); @@ -2575,15 +2472,6 @@ loopback_cmd(int sock, int family) len = read(newfd, &mpi, sizeof (mpi)); /* - * ifconfig does not send any data. Just tests to see if mpathd - * is already running. - */ - if (len <= 0) { - (void) close(newfd); - return; - } - - /* * In theory, we can receive any sized message for a stream socket, * but we don't expect that to happen for a small message over a * loopback connection. @@ -2591,6 +2479,8 @@ loopback_cmd(int sock, int family) if (len < sizeof (uint32_t)) { logerr("loopback_cmd: bad command format or read returns " "partial data %d\n", len); + (void) close(newfd); + return; } cmd = mpi.mi_command; @@ -2600,6 +2490,16 @@ loopback_cmd(int sock, int family) return; } + /* + * Only MI_PING and MI_QUERY can come from unprivileged sources. + */ + if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) { + logerr("Unprivileged request from %s for privileged " + "command %s\n", abuf, commands[cmd].name); + (void) close(newfd); + return; + } + if (len < commands[cmd].size) { logerr("loopback_cmd: short %s command (expected %d, got %d)\n", commands[cmd].name, commands[cmd].size, len); @@ -2615,179 +2515,46 @@ loopback_cmd(int sock, int family) (void) close(newfd); } -extern int global_errno; /* set by failover() or failback() */ - /* - * Process the offline, undo offline and set original index commands, - * received from if_mpadm(1M) + * Process the commands received via libipmp. 
*/ static unsigned int process_cmd(int newfd, union mi_commands *mpi) { - uint_t nif = 0; - uint32_t cmd; struct phyint *pi; - struct phyint *pi2; - struct phyint_group *pg; - boolean_t success; - int error; struct mi_offline *mio; struct mi_undo_offline *miu; - struct lifreq lifr; - int ifsock; - struct mi_setoindex *mis; + unsigned int retval; - cmd = mpi->mi_command; + switch (mpi->mi_command) { + case MI_PING: + return (send_result(newfd, IPMP_SUCCESS, 0)); - switch (cmd) { case MI_OFFLINE: mio = &mpi->mi_ocmd; - /* - * Lookup the interface that needs to be offlined. - * If it does not exist, return a suitable error. - */ + pi = phyint_lookup(mio->mio_ifname); if (pi == NULL) - return (send_result(newfd, IPMP_FAILURE, EINVAL)); - - /* - * Verify that the minimum redundancy requirements are met. - * The multipathing group must have at least the specified - * number of functional interfaces after offlining the - * requested interface. Otherwise return a suitable error. - */ - pg = pi->pi_group; - nif = 0; - if (pg != phyint_anongroup) { - for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL; - pi2 = pi2->pi_pgnext) { - if ((pi2->pi_state == PI_RUNNING) || - (pg->pg_groupfailed && - !(pi2->pi_flags & IFF_OFFLINE))) - nif++; - } - } - if (nif < mio->mio_min_redundancy) - return (send_result(newfd, IPMP_EMINRED, 0)); + return (send_result(newfd, IPMP_EUNKIF, 0)); - /* - * The order of operation is to set IFF_OFFLINE, followed by - * failover. Setting IFF_OFFLINE ensures that no new ipif's - * can be created. Subsequent failover moves everything on - * the OFFLINE interface to some other functional interface. - */ - success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE); - if (success) { - if (!pi->pi_empty) { - error = try_failover(pi, FAILOVER_NORMAL); - if (error != 0) { - if (!change_lif_flags(pi, IFF_OFFLINE, - _B_FALSE)) { - logerr("process_cmd: couldn't" - " clear OFFLINE flag on" - " %s\n", pi->pi_name); - /* - * Offline interfaces should - * not be probed. 
- */ - stop_probing(pi); - } - return (send_result(newfd, error, - global_errno)); - } - } - } else { + retval = phyint_offline(pi, mio->mio_min_redundancy); + if (retval == IPMP_FAILURE) return (send_result(newfd, IPMP_FAILURE, errno)); - } - /* - * The interface is now Offline, so stop probing it. - * Note that if_mpadm(1M) will down the test addresses, - * after receiving a success reply from us. The routing - * socket message will then make us close the socket used - * for sending probes. But it is more logical that an - * offlined interface must not be probed, even if it has - * test addresses. - */ - stop_probing(pi); - return (send_result(newfd, IPMP_SUCCESS, 0)); + return (send_result(newfd, retval, 0)); case MI_UNDO_OFFLINE: miu = &mpi->mi_ucmd; - /* - * Undo the offline command. As usual lookup the interface. - * Send an error if it does not exist or is not offline. - */ - pi = phyint_lookup(miu->miu_ifname); - if (pi == NULL || pi->pi_state != PI_OFFLINE) - return (send_result(newfd, IPMP_FAILURE, EINVAL)); - - /* - * Reset the state of the interface based on the current link - * state; if this phyint subsequently acquires a test address, - * the state will be updated later as a result of the probes. - */ - if (LINK_UP(pi)) - phyint_chstate(pi, PI_RUNNING); - else - phyint_chstate(pi, PI_FAILED); - - if (pi->pi_state == PI_RUNNING) { - /* - * Note that the success of MI_UNDO_OFFLINE is not - * contingent on actually failing back; in the odd - * case where we cannot do it here, we will try again - * in initifs() since pi->pi_full will still be zero. - */ - if (do_failback(pi) != IPMP_SUCCESS) { - logdebug("process_cmd: cannot failback from " - "%s during MI_UNDO_OFFLINE\n", pi->pi_name); - } - } - - /* - * Clear the IFF_OFFLINE flag. We have to do this last - * because do_failback() relies on it being set to decide - * when to display messages. 
- */ - (void) change_lif_flags(pi, IFF_OFFLINE, _B_FALSE); - - /* - * Give the requestor time to configure test addresses - * before complaining that they're missing. - */ - pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME; - - return (send_result(newfd, IPMP_SUCCESS, 0)); - - case MI_SETOINDEX: - mis = &mpi->mi_scmd; - /* Get the socket for doing ioctls */ - ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6; - - /* - * Get index of new original interface. - * The index is returned in lifr.lifr_index. - */ - (void) strlcpy(lifr.lifr_name, mis->mis_new_pifname, - sizeof (lifr.lifr_name)); + pi = phyint_lookup(miu->miu_ifname); + if (pi == NULL) + return (send_result(newfd, IPMP_EUNKIF, 0)); - if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) + retval = phyint_undo_offline(pi); + if (retval == IPMP_FAILURE) return (send_result(newfd, IPMP_FAILURE, errno)); - /* - * Set new original interface index. - * The new index was put into lifr.lifr_index by the - * SIOCGLIFINDEX ioctl. 
- */ - (void) strlcpy(lifr.lifr_name, mis->mis_lifname, - sizeof (lifr.lifr_name)); - - if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0) - return (send_result(newfd, IPMP_FAILURE, errno)); - - return (send_result(newfd, IPMP_SUCCESS, 0)); + return (send_result(newfd, retval, 0)); case MI_QUERY: return (process_query(newfd, &mpi->mi_qcmd)); @@ -2806,6 +2573,8 @@ process_cmd(int newfd, union mi_commands *mpi) static unsigned int process_query(int fd, mi_query_t *miq) { + ipmp_addrinfo_t *adinfop; + ipmp_addrinfolist_t *adlp; ipmp_groupinfo_t *grinfop; ipmp_groupinfolist_t *grlp; ipmp_grouplist_t *grlistp; @@ -2815,6 +2584,19 @@ process_query(int fd, mi_query_t *miq) unsigned int retval; switch (miq->miq_inforeq) { + case IPMP_ADDRINFO: + retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr, + &adinfop); + if (retval != IPMP_SUCCESS) + return (send_result(fd, retval, errno)); + + retval = send_result(fd, IPMP_SUCCESS, 0); + if (retval == IPMP_SUCCESS) + retval = send_addrinfo(fd, adinfop); + + ipmp_freeaddrinfo(adinfop); + return (retval); + case IPMP_GROUPLIST: retval = getgrouplist(&grlistp); if (retval != IPMP_SUCCESS) @@ -2829,7 +2611,7 @@ process_query(int fd, mi_query_t *miq) case IPMP_GROUPINFO: miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; - retval = getgroupinfo(miq->miq_ifname, &grinfop); + retval = getgroupinfo(miq->miq_grname, &grinfop); if (retval != IPMP_SUCCESS) return (send_result(fd, retval, errno)); @@ -2854,6 +2636,11 @@ process_query(int fd, mi_query_t *miq) return (retval); case IPMP_SNAP: + /* + * Before taking the snapshot, sync with the kernel. 
+ */ + initifs(); + retval = getsnap(&snap); if (retval != IPMP_SUCCESS) return (send_result(fd, retval, errno)); @@ -2883,6 +2670,13 @@ process_query(int fd, mi_query_t *miq) if (retval != IPMP_SUCCESS) goto out; } + + adlp = snap->sn_adinfolistp; + for (; adlp != NULL; adlp = adlp->adl_next) { + retval = send_addrinfo(fd, adlp->adl_adinfop); + if (retval != IPMP_SUCCESS) + goto out; + } out: ipmp_snap_free(snap); return (retval); @@ -2902,14 +2696,20 @@ static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) { ipmp_iflist_t *iflistp = grinfop->gr_iflistp; + ipmp_addrlist_t *adlistp = grinfop->gr_adlistp; unsigned int retval; retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); if (retval != IPMP_SUCCESS) return (retval); - return (ipmp_writetlv(fd, IPMP_IFLIST, - IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp)); + retval = ipmp_writetlv(fd, IPMP_IFLIST, + IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp); + if (retval != IPMP_SUCCESS) + return (retval); + + return (ipmp_writetlv(fd, IPMP_ADDRLIST, + IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp)); } /* @@ -2919,7 +2719,31 @@ send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) { - return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop)); + ipmp_addrlist_t *adlist4p = ifinfop->if_targinfo4.it_targlistp; + ipmp_addrlist_t *adlist6p = ifinfop->if_targinfo6.it_targlistp; + unsigned int retval; + + retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop); + if (retval != IPMP_SUCCESS) + return (retval); + + retval = ipmp_writetlv(fd, IPMP_ADDRLIST, + IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p); + if (retval != IPMP_SUCCESS) + return (retval); + + return (ipmp_writetlv(fd, IPMP_ADDRLIST, + IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p)); +} + +/* + * Send the address information pointed to by `adinfop' on file descriptor + * `fd'. Returns an IPMP error code. 
+ */ +static unsigned int +send_addrinfo(int fd, ipmp_addrinfo_t *adinfop) +{ + return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop)); } /* @@ -3109,3 +2933,32 @@ close_probe_socket(struct phyint_instance *pii, boolean_t polled) pii->pii_probe_sock = -1; pii->pii_basetime_inited = 0; } + +boolean_t +addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags, + struct sockaddr_storage *ssp) +{ + addrlist_t *addrp; + + if ((addrp = malloc(sizeof (addrlist_t))) == NULL) + return (_B_FALSE); + + (void) strlcpy(addrp->al_name, name, LIFNAMSIZ); + addrp->al_flags = flags; + addrp->al_addr = *ssp; + addrp->al_next = *addrsp; + *addrsp = addrp; + return (_B_TRUE); +} + +void +addrlist_free(addrlist_t **addrsp) +{ + addrlist_t *addrp, *next_addrp; + + for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) { + next_addrp = addrp->al_next; + free(addrp); + } + *addrsp = NULL; +} diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c index a2ff76a983..cf327fbaff 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -20,8 +20,6 @@ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mpd_defs.h" #include "mpd_tables.h" @@ -45,7 +43,7 @@ struct pr_icmp uint16_t pr_icmp_cksum; /* checksum field */ uint16_t pr_icmp_id; /* Identification */ uint16_t pr_icmp_seq; /* sequence number */ - uint32_t pr_icmp_timestamp; /* Time stamp */ + uint64_t pr_icmp_timestamp; /* Time stamp (in ns) */ uint32_t pr_icmp_mtype; /* Message type */ }; @@ -58,11 +56,12 @@ static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } }; static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */ -static void *find_ancillary(struct msghdr *msg, int cmsg_type); -static void pi_set_crtt(struct target *tg, int m, +static void *find_ancillary(struct msghdr *msg, int cmsg_level, + int cmsg_type); +static void pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni); static void incoming_echo_reply(struct phyint_instance *pii, - struct pr_icmp *reply, struct in6_addr fromaddr); + struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp); static void incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, struct in6_addr fromaddr); static void incoming_mcast_reply(struct phyint_instance *pii, @@ -78,13 +77,11 @@ static void probe_success_info(struct phyint_instance *pii, struct target *cur_tg, struct probe_success_count *psinfo); static boolean_t phyint_repaired(struct phyint *pi); -static int failover(struct phyint *from, struct phyint *to); -static int failback(struct phyint *from, struct phyint *to); -static struct phyint *get_failover_dst(struct phyint *pi, int failover_type); - static boolean_t highest_ack_tg(uint16_t seq, struct target *tg); static int in_cksum(ushort_t *addr, int len); static void reset_snxt_basetimes(void); +static int ns2ms(int64_t ns); +static int64_t tv2ns(struct timeval *); /* * CRTT - Conservative Round Trip Time Estimate @@ -104,7 +101,7 @@ static void reset_snxt_basetimes(void); * Phyint state diagram * * The state of a phyint that is 
capable of being probed, is completely - * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>. + * specified by the 3-tuple <pi_state, pg_state, I>. * * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state * of the link (according to the driver). If the phyint is also configured @@ -117,8 +114,8 @@ static void reset_snxt_basetimes(void); * state, which indicates that the link is apparently functional but that * in.mpathd is unable to send probes to verify functionality (in this case, * in.mpathd makes the optimistic assumption that the interface is working - * correctly and thus does not perform a failover, but reports the interface - * as IPMP_IF_UNKNOWN through the async events and query interfaces). + * correctly and thus does not mark the interface FAILED, but reports it as + * IPMP_IF_UNKNOWN through the async events and query interfaces). * * At any point, a phyint may be administratively marked offline via if_mpadm. * In this case, the interface always transitions to PI_OFFLINE, regardless @@ -131,8 +128,11 @@ static void reset_snxt_basetimes(void); * PI_RUNNING: The failure detection logic says the phyint is good. * PI_FAILED: The failure detection logic says the phyint has failed. * - * pg_groupfailed - Group failure, all interfaces in the group have failed. - * The pi_state may be either PI_FAILED or PI_NOTARGETS. + * pg_state - PG_OK, PG_DEGRADED, or PG_FAILED. + * PG_OK: All interfaces in the group are OK. + * PG_DEGRADED: Some interfaces in the group are unusable. + * PG_FAILED: All interfaces in the group are unusable. + * * In the case of router targets, we assume that the current list of * targets obtained from the routing table, is still valid, so the * phyint stat is PI_FAILED. In the case of host targets, we delete the @@ -140,144 +140,46 @@ static void reset_snxt_basetimes(void); * target list. So the phyints are in the PI_NOTARGETS state. 
* * I - value of (pi_flags & IFF_INACTIVE) - * IFF_INACTIVE: No failovers have been done to this phyint, from - * other phyints. This phyint is inactive. Phyint can be a Standby. - * When failback has been disabled (FAILOVER=no configured), - * phyint can also be a non-STANDBY. In this case IFF_INACTIVE - * is set when phyint subsequently recovers after a failure. - * - * pi_empty - * This phyint has failed over successfully to another phyint, and - * this phyint is currently "empty". It does not host any addresses or - * multicast membership etc. This is the state of a phyint after a - * failover from the phyint has completed successfully and no subsequent - * 'failover to' or 'failback to' has occurred on the phyint. - * IP guarantees that no new logicals will be hosted nor any multicast - * joins permitted on the phyint, since the phyint is either failed or - * inactive. pi_empty is set implies the phyint is either failed or - * inactive. - * - * pi_full - * The phyint hosts all of its own addresses that it "owns". If the - * phyint was previously failed or inactive, failbacks to the phyint - * has completed successfully. i.e. No more failbacks to this phyint - * can produce any change in system state whatsoever. - * - * Not all 32 possible combinations of the above 5-tuple are possible. - * Furthermore some of the above combinations are transient. They may occur - * only because the failover or failback did not complete successfully. The - * failover/failback will be retried and eventually a stable state will be - * reached. - * - * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd. - * The following are the state machines. 
'from' and 'to' are the src and - * dst of the failover/failback, below - * - * pi_empty state machine - * --------------------------------------------------------------------------- - * Event State -> New State - * --------------------------------------------------------------------------- - * successful completion from.pi_empty = 0 -> from.pi_empty = 1 - * of failover + * IFF_INACTIVE: This phyint will not send or receive packets. + * Usually, inactive is tied to standby interfaces that are not yet + * needed (e.g., no non-standby interfaces in the group have failed). + * When failback has been disabled (FAILBACK=no configured), phyint can + * also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint + * subsequently recovers after a failure. * - * Initiate failover to.pi_empty = X -> to.pi_empty = 0 + * Not all 9 possible combinations of the above 3-tuple are possible. * - * Initiate failback to.pi_empty = X -> to.pi_empty = 0 - * - * group failure pi_empty = X -> pi_empty = 0 - * --------------------------------------------------------------------------- - * - * pi_full state machine - * --------------------------------------------------------------------------- - * Event State -> New State - * --------------------------------------------------------------------------- - * successful completion to.pi_full = 0 -> to.pi_full = 1 - * of failback from - * each of the other phyints - * - * Initiate failover from.pi_full = X -> from.pi_full = 0 - * - * group failure pi_full = X -> pi_full = 0 - * --------------------------------------------------------------------------- + * I is tracked by IP. pi_state is tracked by mpathd. 
* * pi_state state machine * --------------------------------------------------------------------------- * Event State New State * Action: * --------------------------------------------------------------------------- - * NIC failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) + * IP interface failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) * detection : set IFF_FAILED on this phyint - * : failover from this phyint to another * - * NIC failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) + * IP interface failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) * detection : set IFF_FAILED on this phyint * - * NIC repair (PI_FAILED, I == 0, FAILBACK=yes) + * IP interface repair (PI_FAILED, I == 0, FAILBACK=yes) * detection -> (PI_RUNNING, I == 0) - * : to.pi_empty = 0 * : clear IFF_FAILED on this phyint - * : failback to this phyint if enabled * - * NIC repair (PI_FAILED, I == 0, FAILBACK=no) + * IP interface repair (PI_FAILED, I == 0, FAILBACK=no) * detection -> (PI_RUNNING, I == 1) - * : to.pi_empty = 0 * : clear IFF_FAILED on this phyint * : if failback is disabled set I == 1 * * Group failure (perform on all phyints in the group) * detection PI_RUNNING PI_FAILED * (Router targets) : set IFF_FAILED - * : clear pi_empty and pi_full * * Group failure (perform on all phyints in the group) * detection PI_RUNNING PI_NOTARGETS * (Host targets) : set IFF_FAILED - * : clear pi_empty and pi_full * : delete the target list on all phyints * --------------------------------------------------------------------------- - * - * I state machine - * --------------------------------------------------------------------------- - * Event State Action: - * --------------------------------------------------------------------------- - * Turn on I pi_empty == 0, STANDBY : failover from standby - * - * Turn off I PI_RUNNING, STANDBY : pi_empty = 0 - * pi_full == 0 : failback to this if enabled - * --------------------------------------------------------------------------- - * - * Assertions: 
(Read '==>' as implies) - * - * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED) - * (pi_empty == 1) ==> (pi_full == 0) - * (pi_full == 1) ==> (pi_empty == 0) - * - * Invariants - * - * pg_groupfailed = 0 && - * 1. (I == 1, pi_empty == 0) ==> initiate failover from standby - * 2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint - * 3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint - * - * 1. says that an inactive standby, that is not empty, has to be failed - * over. For a standby to be truly inactive, it should not host any - * addresses. So we move them to some other phyint. Usually we catch the - * turn on of IFF_INACTIVE, and perform this action. However if the failover - * did not complete successfully, then subsequently we have lost the edge - * trigger, and this invariant kicks in and completes the action. - * - * 2. says that any failed phyint that is not empty must be failed over. - * Usually we do the failover when we detect NIC failure. However if the - * failover does not complete successfully, this invariant kicks in and - * completes the failover. We exclude inactive standby which is covered by 1. - * - * 3. says that any running phyint that is not full must be failed back. - * Usually we do the failback when we detect NIC repair. However if the - * failback does not complete successfully, this invariant kicks in and - * completes the failback. Note that we don't want to failback to an inactive - * standby. - * - * The invariants 1 - 3 and the actions are in initifs(). */ struct probes_missed probes_missed; @@ -295,7 +197,7 @@ struct probes_missed probes_missed; * not less than the current CRTT. pii_probes[] stores data * about these probes. These packets consume sequence number space. * - * PROBE_RTT: This type is used to make only rtt measurments. Normally these + * PROBE_RTT: This type is used to make only rtt measurements. Normally these * are not used. 
Under heavy network load, the rtt may go up very high, * due to a spike, or may appear to go high, due to extreme scheduling * delays. Once the network stress is removed, mpathd takes long time to @@ -310,17 +212,19 @@ struct probes_missed probes_missed; * no targets are known. The packet is multicast to the all hosts addr. */ static void -probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) +probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime) { + hrtime_t sent_hrtime; + struct timeval sent_tv; struct pr_icmp probe_pkt; /* Probe packet */ - struct sockaddr_in6 whereto6; /* target address IPv6 */ - struct sockaddr_in whereto; /* target address IPv4 */ + struct sockaddr_storage targ; /* target address */ + uint_t targaddrlen; /* target address length */ int pr_ndx; /* probe index in pii->pii_probes[] */ boolean_t sent = _B_TRUE; if (debug & D_TARGET) { - logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af), - pii->pii_name, probe_type, cur_time); + logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af), + pii->pii_name, probe_type, start_hrtime); } assert(pii->pii_probe_sock != -1); @@ -339,7 +243,7 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) * network byte order at initialization itself.
*/ probe_pkt.pr_icmp_id = pii->pii_icmpid; - probe_pkt.pr_icmp_timestamp = htonl(cur_time); + probe_pkt.pr_icmp_timestamp = htonll(start_hrtime); probe_pkt.pr_icmp_mtype = htonl(probe_type); /* @@ -349,38 +253,34 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && pii->pii_rtt_target_next != NULL)); + bzero(&targ, sizeof (targ)); + targ.ss_family = pii->pii_af; + if (pii->pii_af == AF_INET6) { - bzero(&whereto6, sizeof (whereto6)); - whereto6.sin6_family = AF_INET6; + struct in6_addr *addr6; + + addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr; + targaddrlen = sizeof (struct sockaddr_in6); if (probe_type == PROBE_MULTI) { - whereto6.sin6_addr = all_nodes_mcast_v6; + *addr6 = all_nodes_mcast_v6; } else if (probe_type == PROBE_UNI) { - whereto6.sin6_addr = pii->pii_target_next->tg_address; - } else { - /* type is PROBE_RTT */ - whereto6.sin6_addr = - pii->pii_rtt_target_next->tg_address; - } - if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, - sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6, - sizeof (whereto6)) != sizeof (probe_pkt)) { - logperror_pii(pii, "probe: probe sendto"); - sent = _B_FALSE; + *addr6 = pii->pii_target_next->tg_address; + } else { /* type is PROBE_RTT */ + *addr6 = pii->pii_rtt_target_next->tg_address; } } else { - bzero(&whereto, sizeof (whereto)); - whereto.sin_family = AF_INET; + struct in_addr *addr4; + + addr4 = &((struct sockaddr_in *)&targ)->sin_addr; + targaddrlen = sizeof (struct sockaddr_in); if (probe_type == PROBE_MULTI) { - whereto.sin_addr = all_nodes_mcast_v4; + *addr4 = all_nodes_mcast_v4; } else if (probe_type == PROBE_UNI) { IN6_V4MAPPED_TO_INADDR( - &pii->pii_target_next->tg_address, - &whereto.sin_addr); - } else { - /* type is PROBE_RTT */ + &pii->pii_target_next->tg_address, addr4); + } else { /* type is PROBE_RTT */ IN6_V4MAPPED_TO_INADDR( - &pii->pii_rtt_target_next->tg_address, - &whereto.sin_addr); + 
&pii->pii_rtt_target_next->tg_address, addr4); } /* @@ -388,12 +288,18 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) */ probe_pkt.pr_icmp_cksum = in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); - if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, - sizeof (probe_pkt), 0, (struct sockaddr *)&whereto, - sizeof (whereto)) != sizeof (probe_pkt)) { - logperror_pii(pii, "probe: probe sendto"); - sent = _B_FALSE; - } + } + + /* + * Use the current time as the time we sent. Not atomic, but the best + * we can do from here. + */ + sent_hrtime = gethrtime(); + (void) gettimeofday(&sent_tv, NULL); + if (sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0, + (struct sockaddr *)&targ, targaddrlen) != sizeof (probe_pkt)) { + logperror_pii(pii, "probe: probe sendto"); + sent = _B_FALSE; } /* @@ -415,9 +321,13 @@ probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) pii->pii_cum_stats.acked++; pii->pii_cum_stats.sent++; - pii->pii_probes[pr_ndx].pr_status = PR_UNACKED; + pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt; + pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv; + pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime; + pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime; pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; - pii->pii_probes[pr_ndx].pr_time_sent = cur_time; + probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED); + pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); pii->pii_target_next = target_next(pii->pii_target_next); assert(pii->pii_target_next != NULL); @@ -448,33 +358,42 @@ in_data(struct phyint_instance *pii) { struct sockaddr_in from; struct in6_addr fromaddr; - uint_t fromlen; - static uint_t in_packet[(IP_MAXPACKET + 1)/4]; + static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; + static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; struct ip *ip; int iphlen; int len; char abuf[INET_ADDRSTRLEN]; - struct pr_icmp *reply; + struct msghdr msg; + struct iovec iov; + struct 
pr_icmp *reply; + struct timeval *recv_tvp; if (debug & D_PROBE) { logdebug("in_data(%s %s)\n", AF_STR(pii->pii_af), pii->pii_name); } + iov.iov_base = (char *)in_packet; + iov.iov_len = sizeof (in_packet); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_name = (struct sockaddr *)&from; + msg.msg_namelen = sizeof (from); + msg.msg_control = ancillary_data; + msg.msg_controllen = sizeof (ancillary_data); + /* * Poll has already told us that a message is waiting, * on this socket. Read it now. We should not block. */ - fromlen = sizeof (from); - len = recvfrom(pii->pii_probe_sock, (char *)in_packet, - sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen); - if (len < 0) { - logperror_pii(pii, "in_data: recvfrom"); + if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { + logperror_pii(pii, "in_data: recvmsg"); return; } /* - * If the NIC has indicated the link is down, don't go + * If the datalink has indicated the link is down, don't go * any further. */ if (LINK_DOWN(pii->pii_phyint)) @@ -483,6 +402,15 @@ in_data(struct phyint_instance *pii) /* Get the printable address for error reporting */ (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); + /* Ignore packets > 64k or control buffers that don't fit */ + if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { + if (debug & D_PKTBAD) { + logdebug("Truncated message: msg_flags 0x%x from %s\n", + msg.msg_flags, abuf); + } + return; + } + /* Make sure packet contains at least minimum ICMP header */ ip = (struct ip *)in_packet; iphlen = ip->ip_hl << 2; @@ -528,10 +456,17 @@ in_data(struct phyint_instance *pii) return; } + recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); + if (recv_tvp == NULL) { + logtrace("message without timestamp from %s on %s\n", + abuf, pii->pii_name); + return; + } + IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) /* Unicast probe reply */ - incoming_echo_reply(pii, reply, fromaddr); + incoming_echo_reply(pii, reply, 
fromaddr, recv_tvp); else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { /* Multicast reply */ incoming_mcast_reply(pii, reply, fromaddr); @@ -543,7 +478,6 @@ in_data(struct phyint_instance *pii) reply->pr_icmp_mtype, abuf, pii->pii_name); return; } - } /* @@ -559,8 +493,9 @@ in6_data(struct phyint_instance *pii) char abuf[INET6_ADDRSTRLEN]; struct msghdr msg; struct iovec iov; - uchar_t *opt; + void *opt; struct pr_icmp *reply; + struct timeval *recv_tvp; if (debug & D_PROBE) { logdebug("in6_data(%s %s)\n", @@ -577,12 +512,12 @@ in6_data(struct phyint_instance *pii) msg.msg_controllen = sizeof (ancillary_data); if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { - logperror_pii(pii, "in6_data: recvfrom"); + logperror_pii(pii, "in6_data: recvmsg"); return; } /* - * If the NIC has indicated that the link is down, don't go + * If the datalink has indicated that the link is down, don't go * any further. */ if (LINK_DOWN(pii->pii_phyint)) @@ -623,13 +558,14 @@ in6_data(struct phyint_instance *pii) "%s on %s\n", abuf, pii->pii_name); return; } - opt = find_ancillary(&msg, IPV6_RTHDR); + opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR); if (opt != NULL) { /* Can't allow routing headers in probe replies */ logtrace("message with routing header from %s on %s\n", abuf, pii->pii_name); return; } + if (reply->pr_icmp_code != 0) { logtrace("probe reply code: %d from %s on %s\n", reply->pr_icmp_code, abuf, pii->pii_name); @@ -640,8 +576,16 @@ in6_data(struct phyint_instance *pii) len, abuf, pii->pii_name); return; } + + recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); + if (recv_tvp == NULL) { + logtrace("message without timestamp from %s on %s\n", + abuf, pii->pii_name); + return; + } + if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { - incoming_echo_reply(pii, reply, from.sin6_addr); + incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp); } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { incoming_mcast_reply(pii, reply, from.sin6_addr); } 
else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { @@ -663,11 +607,9 @@ static void incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, struct in6_addr fromaddr) { - int m; /* rtt measurment in ms */ - uint32_t cur_time; /* in ms from some arbitrary point */ + int64_t m; /* rtt measurement in ns */ char abuf[INET6_ADDRSTRLEN]; struct target *target; - uint32_t pr_icmp_timestamp; struct phyint_group *pg; /* Get the printable address for error reporting */ @@ -683,10 +625,7 @@ incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, if (target == NULL) return; - pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); - cur_time = getcurrenttime(); - m = (int)(cur_time - pr_icmp_timestamp); - + m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp)); /* Invalid rtt. It has wrapped around */ if (m < 0) return; @@ -754,29 +693,30 @@ incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, */ static void incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, - struct in6_addr fromaddr) + struct in6_addr fromaddr, struct timeval *recv_tvp) { - int m; /* rtt measurment in ms */ - uint32_t cur_time; /* in ms from some arbitrary point */ + int64_t m; /* rtt measurement in ns */ + hrtime_t cur_hrtime; /* in ns from some arbitrary point */ char abuf[INET6_ADDRSTRLEN]; int pr_ndx; struct target *target; boolean_t exception; - uint32_t pr_icmp_timestamp; + uint64_t pr_icmp_timestamp; uint16_t pr_icmp_seq; + struct probe_stats *pr_statp; struct phyint_group *pg = pii->pii_phyint->pi_group; /* Get the printable address for error reporting */ (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); if (debug & D_PROBE) { - logdebug("incoming_echo_reply: %s %s %s seq %u\n", + logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n", AF_STR(pii->pii_af), pii->pii_name, abuf, - ntohs(reply->pr_icmp_seq)); + ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp)); } - pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 
- pr_icmp_seq = ntohs(reply->pr_icmp_seq); + pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp); + pr_icmp_seq = ntohs(reply->pr_icmp_seq); /* Reject out of window probe replies */ if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) || @@ -786,15 +726,16 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, pii->pii_cum_stats.unknown++; return; } - cur_time = getcurrenttime(); - m = (int)(cur_time - pr_icmp_timestamp); + + cur_hrtime = gethrtime(); + m = (int64_t)(cur_hrtime - pr_icmp_timestamp); if (m < 0) { /* * This is a ridiculously high value of rtt. rtt has wrapped * around. Log a message, and ignore the rtt. */ - logerr("incoming_echo_reply: rtt wraparound cur_time %u reply " - "timestamp %u\n", cur_time, pr_icmp_timestamp); + logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld " + "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp); } /* @@ -868,10 +809,10 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, * debugger, or the system was hung or too busy for a * substantial time that we didn't get a chance to run. */ - if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) { + if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) { /* - * If the probe corresponding to this receieved response - * was truly sent 'm' ms. ago, then this response must + * If the probe corresponding to this received response + * was truly sent 'm' ns. ago, then this response must * have been rejected by the sequence number checks. The * fact that it has passed the sequence number checks * means that the measured rtt is wrong. 
We were probably @@ -947,7 +888,7 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, * adjusts pii->pii_target_next */ target_delete(target); - probe(pii, PROBE_MULTI, cur_time); + probe(pii, PROBE_MULTI, cur_hrtime); } } else { /* @@ -999,8 +940,12 @@ incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, } } out: - pii->pii_probes[pr_ndx].pr_status = PR_ACKED; - pii->pii_probes[pr_ndx].pr_time_acked = cur_time; + pr_statp = &pii->pii_probes[pr_ndx]; + pr_statp->pr_hrtime_ackproc = cur_hrtime; + pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent + + (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent)); + + probe_chstate(pr_statp, pii, PR_ACKED); /* * Update pii->pii_rack, i.e. the sequence number of the last received @@ -1240,13 +1185,13 @@ incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, * * New scaled average and deviation are passed back via sap and svp */ -static int -compute_crtt(int *sap, int *svp, int m) +static int64_t +compute_crtt(int64_t *sap, int64_t *svp, int64_t m) { - int sa = *sap; - int sv = *svp; - int crtt; - int saved_m = m; + int64_t sa = *sap; + int64_t sv = *svp; + int64_t crtt; + int64_t saved_m = m; assert(*sap >= -1); assert(*svp >= 0); @@ -1285,8 +1230,8 @@ compute_crtt(int *sap, int *svp, int m) crtt = (sa >> 3) + sv; if (debug & D_PROBE) { - logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = " - "%d\n", saved_m, sa, sv, crtt); + logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> " + "crtt = %lld\n", saved_m, sa, sv, crtt); } *sap = sa; @@ -1300,22 +1245,22 @@ compute_crtt(int *sap, int *svp, int m) } static void -pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) +pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni) { struct phyint_instance *pii = tg->tg_phyint_inst; int probe_interval = pii->pii_phyint->pi_group->pg_probeint; - int sa = tg->tg_rtt_sa; - int sv = tg->tg_rtt_sd; + int64_t sa = tg->tg_rtt_sa; + int64_t sv = tg->tg_rtt_sd; 
int new_crtt; int i; if (debug & D_PROBE) - logdebug("pi_set_crtt: target - m %d\n", m); + logdebug("pi_set_crtt: target - m %lld\n", m); /* store the round trip time, in case we need to defer computation */ tg->tg_deferred[tg->tg_num_deferred] = m; - new_crtt = compute_crtt(&sa, &sv, m); + new_crtt = ns2ms(compute_crtt(&sa, &sv, m)); /* * If this probe's round trip time would singlehandedly cause an @@ -1342,8 +1287,8 @@ pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) } for (i = 0; i <= tg->tg_num_deferred; i++) { - tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa, - &tg->tg_rtt_sd, tg->tg_deferred[i]); + tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa, + &tg->tg_rtt_sd, tg->tg_deferred[i])); } tg->tg_num_deferred = 0; @@ -1373,13 +1318,13 @@ pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) * If not found return NULL. */ static void * -find_ancillary(struct msghdr *msg, int cmsg_type) +find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type) { struct cmsghdr *cmsg; for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { - if (cmsg->cmsg_level == IPPROTO_IPV6 && + if (cmsg->cmsg_level == cmsg_level && cmsg->cmsg_type == cmsg_type) { return (CMSG_DATA(cmsg)); } @@ -1388,107 +1333,194 @@ find_ancillary(struct msghdr *msg, int cmsg_type) } /* - * See if a previously failed interface has started working again. + * Try to activate another INACTIVE interface in the same group as `pi'. + * Prefer STANDBY INACTIVE to just INACTIVE. 
*/ void -phyint_check_for_repair(struct phyint *pi) +phyint_activate_another(struct phyint *pi) { - if (phyint_repaired(pi)) { - if (pi->pi_group == phyint_anongroup) { - logerr("NIC repair detected on %s\n", pi->pi_name); - } else { - logerr("NIC repair detected on %s of group %s\n", - pi->pi_name, pi->pi_group->pg_name); - } + struct phyint *pi2; + struct phyint *inactivepi = NULL; - /* - * If the interface is offline, just clear the FAILED flag, - * delaying the state change and failback operation until it - * is brought back online. - */ - if (pi->pi_state == PI_OFFLINE) { - (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); - return; - } + if (pi->pi_group == phyint_anongroup) + return; - if (pi->pi_flags & IFF_STANDBY) { - (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); - } else { - if (try_failback(pi) != IPMP_FAILURE) { - (void) change_lif_flags(pi, - IFF_FAILED, _B_FALSE); - /* Per state diagram */ - pi->pi_empty = 0; + for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { + if (pi == pi2 || pi2->pi_state != PI_RUNNING || + !(pi2->pi_flags & IFF_INACTIVE)) + continue; + + inactivepi = pi2; + if (pi2->pi_flags & IFF_STANDBY) + break; + } + + if (inactivepi != NULL) + (void) change_pif_flags(inactivepi, 0, IFF_INACTIVE); +} + +/* + * Transition a phyint back to PI_RUNNING (from PI_FAILED or PI_OFFLINE). The + * caller must ensure that the transition is appropriate. Clears IFF_OFFLINE + * or IFF_FAILED, as appropriate. Also sets IFF_INACTIVE on this or other + * interfaces as appropriate (see comment below). Finally, also updates the + * phyint's group state to account for the change. + */ +void +phyint_transition_to_running(struct phyint *pi) +{ + struct phyint *pi2; + struct phyint *actstandbypi = NULL; + uint_t nactive = 0, nnonstandby = 0; + boolean_t onlining = (pi->pi_state == PI_OFFLINE); + uint64_t set, clear; + + /* + * The interface is running again, but should it or another interface + * in the group end up INACTIVE? 
There are three cases: + * + * 1. If it's a STANDBY interface, it should end up INACTIVE if + * the group is operating at capacity (i.e., there are at least as + * many active interfaces as non-STANDBY interfaces in the group). + * No other interfaces should be changed. + * + * 2. If it's a non-STANDBY interface and we're onlining it or + * FAILBACK is enabled, then it should *not* end up INACTIVE. + * Further, if the group is above capacity as a result of this + * interface, then an active STANDBY interface in the group should + * end up INACTIVE. + * + * 3. If it's a non-STANDBY interface, we're repairing it, and + * FAILBACK is disabled, then it should end up INACTIVE *unless* + * the group was failed (in which case we have no choice but to + * use it). No other interfaces should be changed. + */ + if (pi->pi_group != phyint_anongroup) { + pi2 = pi->pi_group->pg_phyint; + for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { + if (!(pi2->pi_flags & IFF_STANDBY)) + nnonstandby++; + + if (pi2->pi_state == PI_RUNNING) { + if (!(pi2->pi_flags & IFF_INACTIVE)) { + nactive++; + if (pi2->pi_flags & IFF_STANDBY) + actstandbypi = pi2; + } } } + } - phyint_chstate(pi, PI_RUNNING); + set = 0; + clear = (onlining ? IFF_OFFLINE : IFF_FAILED); - if (GROUP_FAILED(pi->pi_group)) { - /* - * This is the 1st phyint to receive a response - * after group failure.
- */ - logerr("At least 1 interface (%s) of group %s has " - "repaired\n", pi->pi_name, pi->pi_group->pg_name); - phyint_group_chstate(pi->pi_group, PG_RUNNING); - } + if (pi->pi_flags & IFF_STANDBY) { /* case 1 */ + if (nactive >= nnonstandby) + set |= IFF_INACTIVE; + else + clear |= IFF_INACTIVE; + } else if (onlining || failback_enabled) { /* case 2 */ + if (nactive >= nnonstandby && actstandbypi != NULL) + (void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0); + } else if (!GROUP_FAILED(pi->pi_group)) { /* case 3 */ + set |= IFF_INACTIVE; + } + (void) change_pif_flags(pi, set, clear); + + phyint_chstate(pi, PI_RUNNING); + + /* + * Update the group state to account for the change. + */ + phyint_group_refresh_state(pi->pi_group); +} + +/* + * See if a previously failed interface has started working again. + */ +void +phyint_check_for_repair(struct phyint *pi) +{ + if (!phyint_repaired(pi)) + return; + + if (pi->pi_group == phyint_anongroup) { + logerr("IP interface repair detected on %s\n", pi->pi_name); + } else { + logerr("IP interface repair detected on %s of group %s\n", + pi->pi_name, pi->pi_group->pg_name); } + + /* + * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet. + * So just clear IFF_FAILED and defer phyint_transition_to_running() + * until it is brought back online. + */ + if (pi->pi_state == PI_OFFLINE) { + (void) change_pif_flags(pi, 0, IFF_FAILED); + return; + } + + phyint_transition_to_running(pi); /* calls phyint_chstate() */ } /* - * See if a previously functioning interface has failed, or if the - * whole group of interfaces has failed. + * See if an interface has failed, or if the whole group of interfaces has + * failed.
*/ static void phyint_inst_check_for_failure(struct phyint_instance *pii) { - struct phyint *pi; - struct phyint *pi2; - - pi = pii->pii_phyint; + struct phyint *pi = pii->pii_phyint; + struct phyint *pi2; + boolean_t was_active; switch (failure_state(pii)) { case PHYINT_FAILURE: - (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); + was_active = ((pi->pi_flags & IFF_INACTIVE) == 0); + + (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE); if (pi->pi_group == phyint_anongroup) { - logerr("NIC failure detected on %s\n", pii->pii_name); + logerr("IP interface failure detected on %s\n", + pii->pii_name); } else { - logerr("NIC failure detected on %s of group %s\n", - pii->pii_name, pi->pi_group->pg_name); + logerr("IP interface failure detected on %s of group" + " %s\n", pii->pii_name, pi->pi_group->pg_name); } + /* - * Do the failover, unless the interface is offline (in - * which case we've already failed over). + * If the interface is offline, the state change will be + * noted when it comes back online. */ if (pi->pi_state != PI_OFFLINE) { + /* + * If the failed interface was active, activate + * another INACTIVE interface in the group if + * possible. (If the interface is PI_OFFLINE, + * we already activated another.) 
+ */ + if (was_active) + phyint_activate_another(pi); + phyint_chstate(pi, PI_FAILED); reset_crtt_all(pi); - if (!(pi->pi_flags & IFF_INACTIVE)) - (void) try_failover(pi, FAILOVER_NORMAL); } break; case GROUP_FAILURE: - logerr("All Interfaces in group %s have failed\n", - pi->pi_group->pg_name); - for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; - pi2 = pi2->pi_pgnext) { - if (pi2->pi_flags & IFF_OFFLINE) + pi2 = pi->pi_group->pg_phyint; + for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { + (void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE); + if (pi2->pi_state == PI_OFFLINE) /* see comment above */ continue; - (void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE); - reset_crtt_all(pi2); + reset_crtt_all(pi2); /* - * In the case of host targets, we - * would have flushed the targets, - * and gone to PI_NOTARGETS state. + * In the case of host targets, we would have flushed + * the targets, and gone to PI_NOTARGETS state. */ if (pi2->pi_state == PI_RUNNING) phyint_chstate(pi2, PI_FAILED); - - pi2->pi_empty = 0; - pi2->pi_full = 0; } break; @@ -1519,7 +1551,8 @@ phyint_inst_timer(struct phyint_instance *pii) hrtime_t cur_hrtime; int probe_interval = pii->pii_phyint->pi_group->pg_probeint; - cur_time = getcurrenttime(); + cur_hrtime = gethrtime(); + cur_time = ns2ms(cur_hrtime); if (debug & D_TIMER) { logdebug("phyint_inst_timer(%s %s)\n", @@ -1621,7 +1654,7 @@ phyint_inst_timer(struct phyint_instance *pii) * the failure detection (fd) probe timer has not yet fired. * Need to send only an rtt probe. The probe type is PROBE_RTT. */ - probe(pii, PROBE_RTT, cur_time); + probe(pii, PROBE_RTT, cur_hrtime); return (interval); } /* @@ -1651,7 +1684,7 @@ phyint_inst_timer(struct phyint_instance *pii) * We can have at most, the latest 2 probes that we sent, in * the PR_UNACKED state. All previous probes sent, are either * PR_LOST or PR_ACKED. An unacknowledged probe is considered - * timed out if the probe's time_sent + the CRTT < currenttime. 
+ * timed out if the probe's time_start + the CRTT < currenttime. * For each of the last 2 probes, examine whether it has timed * out. If so, mark it PR_LOST. The probe stats is a circular array. */ @@ -1686,16 +1719,15 @@ phyint_inst_timer(struct phyint_instance *pii) * not available use group's probe interval, * which is a worst case estimate. */ + timeout = ns2ms(pr_statp->pr_hrtime_start); if (cur_tg->tg_crtt != 0) { - timeout = pr_statp->pr_time_sent + - cur_tg->tg_crtt; + timeout += cur_tg->tg_crtt; } else { - timeout = pr_statp->pr_time_sent + - probe_interval; + timeout += probe_interval; } if (TIME_LT(timeout, cur_time)) { - pr_statp->pr_status = PR_LOST; pr_statp->pr_time_lost = timeout; + probe_chstate(pr_statp, pii, PR_LOST); } else if (i == 1) { /* * We are forced to consider this probe @@ -1711,8 +1743,8 @@ phyint_inst_timer(struct phyint_instance *pii) * when the timer fires, we find 2 valid * unacked probes, and they are yet to timeout */ - pr_statp->pr_status = PR_LOST; pr_statp->pr_time_lost = cur_time; + probe_chstate(pr_statp, pii, PR_LOST); } else { /* * Only the most recent probe can enter @@ -1740,16 +1772,15 @@ phyint_inst_timer(struct phyint_instance *pii) * The timer has fired. Take appropriate action depending * on the current state of the phyint. * - * PI_RUNNING state - Failure detection and failover - * PI_FAILED state - Repair detection and failback + * PI_RUNNING state - Failure detection + * PI_FAILED state - Repair detection */ switch (pii->pii_phyint->pi_state) { case PI_FAILED: /* * If the most recent probe (excluding unacked probes that * are yet to time out) has been acked, check whether the - * phyint is now repaired. If the phyint is repaired, then - * attempt failback, unless it is an inactive standby. + * phyint is now repaired. 
*/ if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { phyint_check_for_repair(pii->pii_phyint); @@ -1760,10 +1791,8 @@ phyint_inst_timer(struct phyint_instance *pii) /* * It's possible our probes have been lost because of a * spanning-tree mandated quiet period on the switch. If so, - * ignore the lost probes and consider the interface to still - * be functioning. + * ignore the lost probes. */ - cur_hrtime = gethrtime(); if (pii->pii_fd_hrtime - cur_hrtime > 0) break; @@ -1771,8 +1800,7 @@ phyint_inst_timer(struct phyint_instance *pii) /* * We have 1 or more failed probes (excluding unacked * probes that are yet to time out). Determine if the - * phyint has failed. If so attempt a failover, - * unless it is an inactive standby + * phyint has failed. */ phyint_inst_check_for_failure(pii); } @@ -1790,16 +1818,16 @@ phyint_inst_timer(struct phyint_instance *pii) * was called, the target list may be empty. */ if (pii->pii_target_next != NULL) { - probe(pii, PROBE_UNI, cur_time); + probe(pii, PROBE_UNI, cur_hrtime); /* * If we have just the one probe target, and we're not using * router targets, try to find another as we presently have * no resilience. */ if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) - probe(pii, PROBE_MULTI, cur_time); + probe(pii, PROBE_MULTI, cur_hrtime); } else { - probe(pii, PROBE_MULTI, cur_time); + probe(pii, PROBE_MULTI, cur_hrtime); } return (interval); } @@ -1859,8 +1887,8 @@ process_link_state_down(struct phyint *pi) /* * Clear the probe statistics arrays, we don't want the repair - * detection logic relying on probes that were succesful prior - * to the link going down. + * detection logic relying on probes that were successful prior + * to the link going down. 
*/ if (PROBE_CAPABLE(pi->pi_v4)) clear_pii_probe_stats(pi->pi_v4); @@ -2016,7 +2044,7 @@ phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) pii->pii_target_next = target_next(cur_tg); } else { target_delete(cur_tg); - probe(pii, PROBE_MULTI, getcurrenttime()); + probe(pii, PROBE_MULTI, gethrtime()); } return (PHYINT_OK); } @@ -2065,13 +2093,13 @@ failure_state(struct phyint_instance *pii) struct probe_success_count psinfo; uint_t pi2_tls; /* time last success */ uint_t pi_tff; /* time first fail */ - struct phyint *pi2; + struct phyint *pi2; struct phyint *pi; struct phyint_instance *pii2; struct phyint_group *pg; - boolean_t alone; + int retval; - if (debug & D_FAILOVER) + if (debug & D_FAILREP) logdebug("phyint_failed(%s)\n", pii->pii_name); pi = pii->pii_phyint; @@ -2082,24 +2110,13 @@ failure_state(struct phyint_instance *pii) return (PHYINT_OK); /* - * At this point, the link is down, or the phyint is suspect, - * as it has lost NUM_PROBE_FAILS or more probes. If the phyint - * does not belong to any group, or is the only member of the - * group capable of being probed, return PHYINT_FAILURE. + * At this point, the link is down, or the phyint is suspect, as it + * has lost NUM_PROBE_FAILS or more probes. If the phyint does not + * belong to any group, this is a PHYINT_FAILURE. Otherwise, continue + * on to determine whether this should be considered a PHYINT_FAILURE + * or GROUP_FAILURE. */ - alone = _B_TRUE; - if (pg != phyint_anongroup) { - for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { - if (pi2 == pi) - continue; - if (PROBE_CAPABLE(pi2->pi_v4) || - PROBE_CAPABLE(pi2->pi_v6)) { - alone = _B_FALSE; - break; - } - } - } - if (alone) + if (pg == phyint_anongroup) return (PHYINT_FAILURE); /* @@ -2116,6 +2133,7 @@ failure_state(struct phyint_instance *pii) * after it was received, so there is no point looking at the tls * of other phyints. 
*/ + retval = GROUP_FAILURE; for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { /* Exclude ourself from comparison */ if (pi2 == pi) @@ -2123,76 +2141,86 @@ failure_state(struct phyint_instance *pii) if (LINK_DOWN(pi)) { /* - * We use FLAGS_TO_LINK_STATE() to test the - * flags directly, rather then LINK_UP() or - * LINK_DOWN(), as we may not have got round - * to processing the link state for the other - * phyints in the group yet. + * We use FLAGS_TO_LINK_STATE() to test the flags + * directly, rather then LINK_UP() or LINK_DOWN(), as + * we may not have got round to processing the link + * state for the other phyints in the group yet. * - * The check for PI_RUNNING and group - * failure handles the case when the - * group begins to recover. The first - * phyint to recover should not trigger - * a failover from the soon-to-recover - * other phyints to the first recovered - * phyint. PI_RUNNING will be set, and - * pg_groupfailed cleared only after - * receipt of NUM_PROBE_REPAIRS, by - * which time the other phyints should - * have received at least 1 packet, - * and so will not have NUM_PROBE_FAILS. + * The check for PI_RUNNING and group failure handles + * the case when the group begins to recover. + * PI_RUNNING will be set, and group failure cleared + * only after receipt of NUM_PROBE_REPAIRS, by which + * time the other phyints should have received at + * least 1 packet, and so will not have NUM_PROBE_FAILS. */ if ((pi2->pi_state == PI_RUNNING) && - !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) - return (PHYINT_FAILURE); - } else { - /* - * Need to compare against both IPv4 and - * IPv6 instances. - */ - pii2 = pi2->pi_v4; - if (pii2 != NULL) { - probe_success_info(pii2, NULL, &psinfo); - if (psinfo.ps_tls_valid) { - pi2_tls = psinfo.ps_tls; - /* - * See comment above regarding check - * for PI_RUNNING and group failure. 
- */ - if (TIME_GT(pi2_tls, pi_tff) && - (pi2->pi_state == PI_RUNNING) && - !GROUP_FAILED(pg) && - FLAGS_TO_LINK_STATE(pi2)) - return (PHYINT_FAILURE); + !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) { + retval = PHYINT_FAILURE; + break; + } + continue; + } + + if (LINK_DOWN(pi2)) + continue; + + /* + * If there's no probe-based failure detection on this + * interface, and its link is still up, then it's still + * working and thus the group has not failed. + */ + if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) { + retval = PHYINT_FAILURE; + break; + } + + /* + * Need to compare against both IPv4 and IPv6 instances. + */ + pii2 = pi2->pi_v4; + if (pii2 != NULL) { + probe_success_info(pii2, NULL, &psinfo); + if (psinfo.ps_tls_valid) { + pi2_tls = psinfo.ps_tls; + /* + * See comment above regarding check + * for PI_RUNNING and group failure. + */ + if (TIME_GT(pi2_tls, pi_tff) && + (pi2->pi_state == PI_RUNNING) && + !GROUP_FAILED(pg) && + FLAGS_TO_LINK_STATE(pi2)) { + retval = PHYINT_FAILURE; + break; } } + } - pii2 = pi2->pi_v6; - if (pii2 != NULL) { - probe_success_info(pii2, NULL, &psinfo); - if (psinfo.ps_tls_valid) { - pi2_tls = psinfo.ps_tls; - /* - * See comment above regarding check - * for PI_RUNNING and group failure. - */ - if (TIME_GT(pi2_tls, pi_tff) && - (pi2->pi_state == PI_RUNNING) && - !GROUP_FAILED(pg) && - FLAGS_TO_LINK_STATE(pi2)) - return (PHYINT_FAILURE); + pii2 = pi2->pi_v6; + if (pii2 != NULL) { + probe_success_info(pii2, NULL, &psinfo); + if (psinfo.ps_tls_valid) { + pi2_tls = psinfo.ps_tls; + /* + * See comment above regarding check + * for PI_RUNNING and group failure. + */ + if (TIME_GT(pi2_tls, pi_tff) && + (pi2->pi_state == PI_RUNNING) && + !GROUP_FAILED(pg) && + FLAGS_TO_LINK_STATE(pi2)) { + retval = PHYINT_FAILURE; + break; } } } } /* - * Change the group state to PG_FAILED if it's not already. + * Update the group state to account for the changes. 
*/ - if (!GROUP_FAILED(pg)) - phyint_group_chstate(pg, PG_FAILED); - - return (GROUP_FAILURE); + phyint_group_refresh_state(pg); + return (retval); } /* @@ -2215,7 +2243,7 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg, uint_t timeout; struct target *tg; - if (debug & D_FAILOVER) + if (debug & D_FAILREP) logdebug("probe_success_info(%s)\n", pii->pii_name); bzero(psinfo, sizeof (*psinfo)); @@ -2248,10 +2276,11 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg, * not available use the value of the group's probe * interval which is a worst case estimate. */ + timeout = ns2ms(pr_statp->pr_hrtime_start); if (tg->tg_crtt != 0) { - timeout = pr_statp->pr_time_sent + tg->tg_crtt; + timeout += tg->tg_crtt; } else { - timeout = pr_statp->pr_time_sent + + timeout += pii->pii_phyint->pi_group->pg_probeint; } @@ -2261,7 +2290,7 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg, * recent consecutive successes. */ pr_statp->pr_time_lost = timeout; - pr_statp->pr_status = PR_LOST; + probe_chstate(pr_statp, pii, PR_LOST); pi_found_failure = _B_TRUE; if (cur_tg != NULL && tg == cur_tg) { /* @@ -2292,7 +2321,8 @@ probe_success_info(struct phyint_instance *pii, struct target *cur_tg, * the most recent probe success. */ if (!psinfo->ps_tls_valid) { - psinfo->ps_tls = pr_statp->pr_time_acked; + psinfo->ps_tls = + ns2ms(pr_statp->pr_hrtime_ackproc); psinfo->ps_tls_valid = _B_TRUE; } break; @@ -2339,7 +2369,7 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, uint_t timeout; struct target *tg; - if (debug & D_FAILOVER) + if (debug & D_FAILREP) logdebug("probe_fail_info(%s)\n", pii->pii_name); bzero(pfinfo, sizeof (*pfinfo)); @@ -2377,10 +2407,11 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, * not available use the group's probe interval, * which is a worst case estimate. 
*/ + timeout = ns2ms(pr_statp->pr_hrtime_start); if (tg->tg_crtt != 0) { - timeout = pr_statp->pr_time_sent + tg->tg_crtt; + timeout += tg->tg_crtt; } else { - timeout = pr_statp->pr_time_sent + + timeout += pii->pii_phyint->pi_group->pg_probeint; } @@ -2388,7 +2419,7 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, break; pr_statp->pr_time_lost = timeout; - pr_statp->pr_status = PR_LOST; + probe_chstate(pr_statp, pii, PR_LOST); /* FALLTHRU */ case PR_LOST: @@ -2421,6 +2452,19 @@ probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, } /* + * Change the state of probe `pr' on phyint_instance `pii' to state `state'. + */ +void +probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state) +{ + if (pr->pr_status == state) + return; + + pr->pr_status = state; + (void) probe_state_event(pr, pii); +} + +/* * Check if the phyint has been repaired. If no test address has been * configured, then consider the interface repaired if the link is up (unless * the link is flapping; see below). Otherwise, look for proof of probes @@ -2436,7 +2480,7 @@ phyint_repaired(struct phyint *pi) int pr_ndx; uint_t cur_time; - if (debug & D_FAILOVER) + if (debug & D_FAILREP) logdebug("phyint_repaired(%s)\n", pi->pi_name); if (LINK_DOWN(pi)) @@ -2458,7 +2502,7 @@ phyint_repaired(struct phyint *pi) } if (!pi->pi_lfmsg_printed) { logerr("The link has come up on %s more than %d times " - "in the last minute; disabling failback until it " + "in the last minute; disabling repair until it " "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); pi->pi_lfmsg_printed = 1; } @@ -2490,354 +2534,41 @@ phyint_repaired(struct phyint *pi) } /* - * Try failover from phyint 'pi' to a suitable destination. - */ -int -try_failover(struct phyint *pi, int failover_type) -{ - struct phyint *dst; - int err; - - if (debug & D_FAILOVER) - logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type); - - /* - * Attempt to find a failover destination 'dst'. 
- * dst will be null if any of the following is true - * Phyint is not part of a group OR - * Phyint is the only member of a group OR - * No suitable failover dst was available - */ - dst = get_failover_dst(pi, failover_type); - if (dst == NULL) - return (IPMP_EMINRED); - - dst->pi_empty = 0; /* Per state diagram */ - pi->pi_full = 0; /* Per state diagram */ - - err = failover(pi, dst); - - if (debug & D_FAILOVER) { - logdebug("failed over from %s to %s ret %d\n", - pi->pi_name, dst->pi_name, err); - } - if (err == 0) { - pi->pi_empty = 1; /* Per state diagram */ - /* - * we don't want to print out this message if a - * phyint is leaving the group, nor for failover from - * standby - */ - if (failover_type == FAILOVER_NORMAL) { - logerr("Successfully failed over from NIC %s to NIC " - "%s\n", pi->pi_name, dst->pi_name); - } - return (0); - } else { - /* - * The failover did not succeed. We must retry the failover - * only after resyncing our state based on the kernel's. - * For eg. either the src or the dst might have been unplumbed - * causing this failure. initifs() will be called again, - * from main, since full_scan_required has been set to true - * by failover(); - */ - return (IPMP_FAILURE); - } -} - -/* - * global_errno captures the errno value, if failover() or failback() - * fails. This is sent to if_mpadm(1M). - */ -int global_errno; - -/* - * Attempt failover from phyint 'from' to phyint 'to'. - * IP moves everything from phyint 'from' to phyint 'to'. - */ -static int -failover(struct phyint *from, struct phyint *to) -{ - struct lifreq lifr; - int ret; - - if (debug & D_FAILOVER) { - logdebug("failing over from %s to %s\n", - from->pi_name, to->pi_name); - } - - /* - * Perform the failover. Both IPv4 and IPv6 are failed over - * using a single ioctl by passing in AF_UNSPEC family. 
- */ - lifr.lifr_addr.ss_family = AF_UNSPEC; - (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_movetoindex = to->pi_ifindex; - - ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr); - if (ret < 0) { - global_errno = errno; - logperror("failover: ioctl (failover)"); - } - - /* - * Set full_scan_required to true. This will make us read - * the state from the kernel in initifs() and update our tables, - * to reflect the current state after the failover. If the - * failover has failed it will then reissue the failover. - */ - full_scan_required = _B_TRUE; - return (ret); -} - -/* - * phyint 'pi' has recovered. Attempt failback from every phyint in the same - * group as phyint 'pi' that is a potential failback source, to phyint 'pi'. - * Return values: - * IPMP_SUCCESS: Failback successful from each of the other - * phyints in the group. - * IPMP_EFBPARTIAL: Failback successful from some of the other - * phyints in the group. - * IPMP_FAILURE: Failback syscall failed with some error. - * - * Note that failback is attempted regardless of the setting of the - * failback_enabled flag. - */ -int -do_failback(struct phyint *pi) -{ - struct phyint *from; - boolean_t done; - boolean_t partial; - boolean_t attempted_failback = _B_FALSE; - - if (debug & D_FAILOVER) - logdebug("do_failback(%s)\n", pi->pi_name); - - /* If this phyint is not part of a named group, return. */ - if (pi->pi_group == phyint_anongroup) { - pi->pi_full = 1; - return (IPMP_SUCCESS); - } - - /* - * Attempt failback from every phyint in the group to 'pi'. - * The reason for doing this, instead of only from the - * phyint to which we did the failover is given below. - * - * After 'pi' failed, if any app. tries to join on a multicast - * address (IPv6), on the failed phyint, IP picks any arbitrary - * non-failed phyint in the group, instead of the failed phyint, - * in.mpathd is not aware of this. 
Thus failing back only from the - * interface to which 'pi' failed over, will failback the ipif's - * but not the ilm's. So we need to failback from all members of - * the phyint group - */ - done = _B_TRUE; - partial = _B_FALSE; - for (from = pi->pi_group->pg_phyint; from != NULL; - from = from->pi_pgnext) { - /* Exclude ourself as a failback src */ - if (from == pi) - continue; - - /* - * If the 'from' phyint has IPv4 plumbed, the 'to' - * phyint must also have IPv4 plumbed. Similar check - * for IPv6. IP makes the same check. Otherwise the - * failback will fail. - */ - if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) || - (from->pi_v6 != NULL && pi->pi_v6 == NULL)) { - partial = _B_TRUE; - continue; - } - - pi->pi_empty = 0; /* Per state diagram */ - attempted_failback = _B_TRUE; - if (failback(from, pi) != 0) { - done = _B_FALSE; - break; - } - } - - /* - * We are done. No more phyint from which we can src the failback - */ - if (done) { - if (!partial) - pi->pi_full = 1; /* Per state diagram */ - /* - * Don't print out a message unless there is a - * transition from FAILED to RUNNING. For eg. - * we don't want to print out this message if a - * phyint is leaving the group, or at startup - */ - if (attempted_failback && (pi->pi_flags & - (IFF_FAILED | IFF_OFFLINE))) { - logerr("Successfully failed back to NIC %s\n", - pi->pi_name); - } - return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); - } - - return (IPMP_FAILURE); -} - -/* - * This function is similar to do_failback() above, but respects the - * failback_enabled flag for phyints in named groups. - */ -int -try_failback(struct phyint *pi) -{ - if (debug & D_FAILOVER) - logdebug("try_failback(%s)\n", pi->pi_name); - - if (pi->pi_group != phyint_anongroup && !failback_enabled) - return (IPMP_EFBDISABLED); - - return (do_failback(pi)); -} - -/* - * Failback everything from phyint 'from' that has the same ifindex - * as phyint to's ifindex. 
- */ -static int -failback(struct phyint *from, struct phyint *to) -{ - struct lifreq lifr; - int ret; - - if (debug & D_FAILOVER) - logdebug("failback(%s %s)\n", from->pi_name, to->pi_name); - - lifr.lifr_addr.ss_family = AF_UNSPEC; - (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_movetoindex = to->pi_ifindex; - - ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr); - if (ret < 0) { - global_errno = errno; - logperror("failback: ioctl (failback)"); - } - - /* - * Set full_scan_required to true. This will make us read - * the state from the kernel in initifs() and update our tables, - * to reflect the current state after the failback. If the - * failback has failed it will then reissue the failback. - */ - full_scan_required = _B_TRUE; - - return (ret); -} - -/* - * Select a target phyint for failing over from 'pi'. - * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred - * target phyint is chosen as follows, - * 1. Pick any inactive standby interface. - * 2. If no inactive standby is available, select any phyint in the - * same group that has the least number of logints, (excluding - * IFF_NOFAILOVER and !IFF_UP logints) - * If we are failing over from a standby, failover_type is - * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination. - * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY, - * and we won't return NULL, as long as there is at least 1 other phyint - * in the group. - */ -static struct phyint * -get_failover_dst(struct phyint *pi, int failover_type) -{ - struct phyint *maybe = NULL; - struct phyint *pi2; - struct phyint *last_choice = NULL; - - if (pi->pi_group == phyint_anongroup) - return (NULL); - - /* - * Loop thru the phyints in the group, and pick the preferred - * phyint for the target. 
- */ - for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { - /* Exclude ourself and offlined interfaces */ - if (pi2 == pi || pi2->pi_state == PI_OFFLINE) - continue; - - /* - * The chosen target phyint must have IPv4 instance - * plumbed, if the src phyint has IPv4 plumbed. Similarly - * for IPv6. - */ - if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) || - (pi2->pi_v6 == NULL && pi->pi_v6 != NULL)) - continue; - - /* The chosen target must be PI_RUNNING. */ - if (pi2->pi_state != PI_RUNNING) { - last_choice = pi2; - continue; - } - - if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) && - (failover_type != FAILOVER_TO_NONSTANDBY)) { - return (pi2); - } else { - if (maybe == NULL) - maybe = pi2; - else if (logint_upcount(pi2) < logint_upcount(maybe)) - maybe = pi2; - } - } - if (maybe == NULL && failover_type == FAILOVER_TO_ANY) - return (last_choice); - else - return (maybe); -} - -/* * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. */ boolean_t -change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) +change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear) { int ifsock; struct lifreq lifr; uint64_t old_flags; - if (debug & D_FAILOVER) { - logdebug("change_lif_flags(%s): flags %llx setfl %d\n", - pi->pi_name, flags, (int)setfl); + if (debug & D_FAILREP) { + logdebug("change_pif_flags(%s): set %llx clear %llx\n", + pi->pi_name, set, clear); } - if (pi->pi_v4 != NULL) { + if (pi->pi_v4 != NULL) ifsock = ifsock_v4; - } else { + else ifsock = ifsock_v6; - } /* * Get the current flags from the kernel, and set/clear the * desired phyint flags. Since we set only phyint flags, we can * do it on either IPv4 or IPv6 instance. 
*/ - (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; + (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); + if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) - logperror("change_lif_flags: ioctl (get flags)"); + logperror("change_pif_flags: ioctl (get flags)"); return (_B_FALSE); } old_flags = lifr.lifr_flags; - if (setfl) - lifr.lifr_flags |= flags; - else - lifr.lifr_flags &= ~flags; + lifr.lifr_flags |= set; + lifr.lifr_flags &= ~clear; if (old_flags == lifr.lifr_flags) { /* No change in the flags. No need to send ioctl */ @@ -2846,7 +2577,7 @@ change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) - logperror("change_lif_flags: ioctl (set flags)"); + logperror("change_pif_flags: ioctl (set flags)"); return (_B_FALSE); } @@ -2854,15 +2585,13 @@ change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) * Keep pi_flags in synch. with actual flags. Assumes flags are * phyint flags. */ - if (setfl) - pi->pi_flags |= flags; - else - pi->pi_flags &= ~flags; + pi->pi_flags |= set; + pi->pi_flags &= ~clear; - if (pi->pi_v4) + if (pi->pi_v4 != NULL) pi->pi_v4->pii_flags = pi->pi_flags; - if (pi->pi_v6) + if (pi->pi_v6 != NULL) pi->pi_v6->pii_flags = pi->pi_flags; return (_B_TRUE); @@ -2928,18 +2657,31 @@ reset_snxt_basetimes(void) * and it is up, it is not possible to detect the interface failure. * SIOCTMYADDR also doesn't consider local zone address as own address. * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they - * are stored in laddr_list. + * are stored in `localaddrs' */ - boolean_t own_address(struct in6_addr addr) { - struct local_addr *taddr = laddr_list; + addrlist_t *addrp; + struct sockaddr_storage ss; + int af = IN6_IS_ADDR_V4MAPPED(&addr) ? 
AF_INET : AF_INET6; - for (; taddr != NULL; taddr = taddr->next) { - if (IN6_ARE_ADDR_EQUAL(&addr, &taddr->addr)) { + addr2storage(af, &addr, &ss); + for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) { + if (sockaddrcmp(&ss, &addrp->al_addr)) return (_B_TRUE); - } } return (_B_FALSE); } + +static int +ns2ms(int64_t ns) +{ + return (ns / (NANOSEC / MILLISEC)); +} + +static int64_t +tv2ns(struct timeval *tvp) +{ + return (tvp->tv_sec * NANOSEC + tvp->tv_usec * 1000); +} diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c index b56648cf12..def08d39ce 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mpd_defs.h" #include "mpd_tables.h" @@ -47,11 +45,7 @@ static void phyint_inst_print(struct phyint_instance *pii); static void phyint_insert(struct phyint *pi, struct phyint_group *pg); static void phyint_delete(struct phyint *pi); - -static void phyint_group_insert(struct phyint_group *pg); -static void phyint_group_delete(struct phyint_group *pg); -static struct phyint_group *phyint_group_lookup(const char *pg_name); -static struct phyint_group *phyint_group_create(const char *pg_name); +static boolean_t phyint_is_usable(struct phyint *pi); static void logint_print(struct logint *li); static void logint_insert(struct phyint_instance *pii, struct logint *li); @@ -68,16 +62,13 @@ static void reset_pii_probes(struct phyint_instance *pii, struct target *tg); static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii); static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii); -static void ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask); 
-static boolean_t prefix_equal(struct in6_addr p1, struct in6_addr p2, - int prefix_len); - static int phyint_state_event(struct phyint_group *pg, struct phyint *pi); static int phyint_group_state_event(struct phyint_group *pg); static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t); static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi, ipmp_if_op_t op); +static int logint_upcount(struct phyint *pi); static uint64_t gensig(void); /* Initialize any per-file global state. Returns 0 on success, -1 on failure */ @@ -110,6 +101,183 @@ phyint_lookup(const char *name) return (pi); } +/* + * Lookup a phyint in the group that has the same hardware address as `pi', or + * NULL if there's none. If `online_only' is set, then only online phyints + * are considered when matching. Otherwise, phyints that had been offlined + * due to a duplicate hardware address will also be considered. + */ +static struct phyint * +phyint_lookup_hwaddr(struct phyint *pi, boolean_t online_only) +{ + struct phyint *pi2; + + if (pi->pi_group == phyint_anongroup) + return (NULL); + + for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { + if (pi2 == pi) + continue; + + /* + * NOTE: even when online_only is B_FALSE, we ignore phyints + * that are administratively offline (rather than offline + * because they're dups); when they're brought back online, + * they'll be flagged as dups if need be. + */ + if (pi2->pi_state == PI_OFFLINE && + (online_only || !pi2->pi_hwaddrdup)) + continue; + + if (pi2->pi_hwaddrlen == pi->pi_hwaddrlen && + bcmp(pi2->pi_hwaddr, pi->pi_hwaddr, pi->pi_hwaddrlen) == 0) + return (pi2); + } + return (NULL); +} + +/* + * Respond to DLPI notifications. Currently, this only processes physical + * address changes for the phyint passed via `arg' by onlining or offlining + * phyints in the group. 
+ */ +/* ARGSUSED */ +static void +phyint_link_notify(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg) +{ + struct phyint *pi = arg; + struct phyint *oduppi = NULL, *duppi = NULL; + + assert((dnip->dni_note & pi->pi_notes) != 0); + + if (dnip->dni_note != DL_NOTE_PHYS_ADDR) + return; + + assert(dnip->dni_physaddrlen <= DLPI_PHYSADDR_MAX); + + /* + * If our hardware address hasn't changed, there's nothing to do. + */ + if (pi->pi_hwaddrlen == dnip->dni_physaddrlen && + bcmp(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen) == 0) + return; + + oduppi = phyint_lookup_hwaddr(pi, _B_FALSE); + pi->pi_hwaddrlen = dnip->dni_physaddrlen; + (void) memcpy(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen); + duppi = phyint_lookup_hwaddr(pi, _B_FALSE); + + if (oduppi != NULL || pi->pi_hwaddrdup) { + /* + * Our old hardware address was a duplicate. If we'd been + * offlined because of it, and our new hardware address is not + * a duplicate, then bring us online. Otherwise, `oduppi' + * must've been the one brought offline; bring it online. + */ + if (pi->pi_hwaddrdup) { + if (duppi == NULL) + (void) phyint_undo_offline(pi); + } else { + assert(oduppi->pi_hwaddrdup); + (void) phyint_undo_offline(oduppi); + } + } + + if (duppi != NULL && !pi->pi_hwaddrdup) { + /* + * Our new hardware address was a duplicate and we're not + * yet flagged as a duplicate; bring us offline. + */ + pi->pi_hwaddrdup = _B_TRUE; + (void) phyint_offline(pi, 0); + } +} + +/* + * Initialize information about the underlying link for `pi', and set us + * up to be notified about future changes. Returns _B_TRUE on success. 
+ */ +boolean_t +phyint_link_init(struct phyint *pi) +{ + int retval; + uint_t notes; + const char *errmsg; + dlpi_notifyid_t id; + + pi->pi_notes = 0; + retval = dlpi_open(pi->pi_name, &pi->pi_dh, 0); + if (retval != DLPI_SUCCESS) { + pi->pi_dh = NULL; + errmsg = "cannot open"; + goto failed; + } + + pi->pi_hwaddrlen = DLPI_PHYSADDR_MAX; + retval = dlpi_get_physaddr(pi->pi_dh, DL_CURR_PHYS_ADDR, pi->pi_hwaddr, + &pi->pi_hwaddrlen); + if (retval != DLPI_SUCCESS) { + errmsg = "cannot get hardware address"; + goto failed; + } + + retval = dlpi_bind(pi->pi_dh, DLPI_ANY_SAP, NULL); + if (retval != DLPI_SUCCESS) { + errmsg = "cannot bind to DLPI_ANY_SAP"; + goto failed; + } + + /* + * Check if the link supports DLPI link state notifications. For + * historical reasons, the actual changes are tracked through routing + * sockets, so we immediately disable the notification upon success. + */ + notes = DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN; + retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id); + if (retval == DLPI_SUCCESS) { + (void) dlpi_disabnotify(pi->pi_dh, id, NULL); + pi->pi_notes |= notes; + } + + /* + * Enable notification of hardware address changes to keep pi_hwaddr + * up-to-date and track if we need to offline/undo-offline phyints. + */ + notes = DL_NOTE_PHYS_ADDR; + retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id); + if (retval == DLPI_SUCCESS && poll_add(dlpi_fd(pi->pi_dh)) == 0) + pi->pi_notes |= notes; + + return (_B_TRUE); +failed: + logerr("%s: %s: %s\n", pi->pi_name, errmsg, dlpi_strerror(retval)); + if (pi->pi_dh != NULL) { + dlpi_close(pi->pi_dh); + pi->pi_dh = NULL; + } + return (_B_FALSE); +} + +/* + * Close use of link on `pi'. 
+ */ +void +phyint_link_close(struct phyint *pi) +{ + if (pi->pi_notes & DL_NOTE_PHYS_ADDR) { + (void) poll_remove(dlpi_fd(pi->pi_dh)); + pi->pi_notes &= ~DL_NOTE_PHYS_ADDR; + } + + /* + * NOTE: we don't clear pi_notes here so that iflinkstate() can still + * properly report the link state even when offline (which is possible + * since we use IFF_RUNNING to track link state). + */ + dlpi_close(pi->pi_dh); + pi->pi_dh = NULL; +} + /* Return the phyint instance with the given name and the given family */ struct phyint_instance * phyint_inst_lookup(int af, char *name) @@ -128,7 +296,7 @@ phyint_inst_lookup(int af, char *name) return (PHYINT_INSTANCE(pi, af)); } -static struct phyint_group * +struct phyint_group * phyint_group_lookup(const char *pg_name) { struct phyint_group *pg; @@ -173,6 +341,9 @@ phyint_insert(struct phyint *pi, struct phyint_group *pg) pi->pi_pgnext->pi_pgprev = pi; pg->pg_phyint = pi; + /* Refresh the group state now that this phyint has been added */ + phyint_group_refresh_state(pg); + pg->pg_sig++; (void) phyint_group_member_event(pg, pi, IPMP_IF_ADD); } @@ -214,24 +385,24 @@ phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex, } /* - * Record the phyint values. Also insert the phyint into the - * phyint group by calling phyint_insert(). + * Record the phyint values. */ (void) strlcpy(pi->pi_name, pi_name, sizeof (pi->pi_name)); pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME; pi->pi_ifindex = ifindex; - pi->pi_icmpid = - htons(((getpid() & 0xFF) << 8) | (pi->pi_ifindex & 0xFF)); + pi->pi_icmpid = htons(((getpid() & 0xFF) << 8) | (ifindex & 0xFF)); + /* - * We optimistically start in the PI_RUNNING state. Later (in - * process_link_state_changes()), we will readjust this to match the + * If the interface is offline, we set the state to PI_OFFLINE. + * Otherwise, we optimistically start in the PI_RUNNING state. Later + * (in process_link_state_changes()), we will adjust this to match the * current state of the link. 
Further, if test addresses are * subsequently assigned, we will transition to PI_NOTARGETS and then - * either PI_RUNNING or PI_FAILED, depending on the result of the test - * probes. + * to either PI_RUNNING or PI_FAILED depending on the probe results. */ - pi->pi_state = PI_RUNNING; + pi->pi_state = (flags & IFF_OFFLINE) ? PI_OFFLINE : PI_RUNNING; pi->pi_flags = PHYINT_FLAGS(flags); + /* * Initialise the link state. The link state is initialised to * up, so that if the link is down when IPMP starts monitoring @@ -241,19 +412,17 @@ phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex, */ INIT_LINK_STATE(pi); + if (!phyint_link_init(pi)) { + free(pi); + return (NULL); + } + /* * Insert the phyint in the list of all phyints, and the * list of phyint group members */ phyint_insert(pi, pg); - /* - * If we are joining a failed group, mark the interface as - * failed. - */ - if (GROUP_FAILED(pg)) - (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); - return (pi); } @@ -313,15 +482,14 @@ phyint_chstate(struct phyint *pi, enum pi_state state) return; pi->pi_state = state; - pi->pi_group->pg_sig++; - (void) phyint_state_event(pi->pi_group, pi); + phyint_changed(pi); } /* - * Note that the type of phyint `pi' has changed. + * Note that `pi' has changed state. */ void -phyint_newtype(struct phyint *pi) +phyint_changed(struct phyint *pi) { pi->pi_group->pg_sig++; (void) phyint_state_event(pi->pi_group, pi); @@ -331,7 +499,7 @@ phyint_newtype(struct phyint *pi) * Insert the phyint group in the linked list of all phyint groups * at the head of the list */ -static void +void phyint_group_insert(struct phyint_group *pg) { pg->pg_next = phyint_groups; @@ -347,7 +515,7 @@ phyint_group_insert(struct phyint_group *pg) /* * Create a new phyint group called 'name'. 
*/ -static struct phyint_group * +struct phyint_group * phyint_group_create(const char *name) { struct phyint_group *pg; @@ -363,9 +531,16 @@ phyint_group_create(const char *name) (void) strlcpy(pg->pg_name, name, sizeof (pg->pg_name)); pg->pg_sig = gensig(); - pg->pg_fdt = user_failure_detection_time; pg->pg_probeint = user_probe_interval; + pg->pg_in_use = _B_TRUE; + + /* + * Normal groups always start in the PG_FAILED state since they + * have no active interfaces. In contrast, anonymous groups are + * heterogeneous and thus always PG_OK. + */ + pg->pg_state = (name[0] == '\0' ? PG_OK : PG_FAILED); return (pg); } @@ -378,10 +553,20 @@ phyint_group_chstate(struct phyint_group *pg, enum pg_state state) { assert(pg != phyint_anongroup); + /* + * To simplify things, some callers always set a given state + * regardless of the previous state of the group (e.g., setting + * PG_DEGRADED when it's already set). We shouldn't bother + * generating an event or consuming a signature for these, since + * the actual state of the group is unchanged. + */ + if (pg->pg_state == state) + return; + + pg->pg_state = state; + switch (state) { case PG_FAILED: - pg->pg_groupfailed = 1; - /* * We can never know with certainty that a group has * failed. It is possible that all known targets have @@ -392,16 +577,15 @@ phyint_group_chstate(struct phyint_group *pg, enum pg_state state) * hosts, we have to discover it by multicast. So flush * all the host targets. The next probe will send out a * multicast echo request. If this is a group failure, we - * will still not see any response, otherwise we will - * clear the pg_groupfailed flag after we get - * NUM_PROBE_REPAIRS consecutive unicast replies on any - * phyint. + * will still not see any response, otherwise the group + * will be repaired after we get NUM_PROBE_REPAIRS + * consecutive unicast replies on any phyint. 
*/ target_flush_hosts(pg); break; - case PG_RUNNING: - pg->pg_groupfailed = 0; + case PG_OK: + case PG_DEGRADED: break; default: @@ -432,7 +616,6 @@ phyint_inst_init_from_k(int af, char *pi_name) struct lifreq lifr; struct phyint *pi; struct phyint_instance *pii; - boolean_t pg_created; boolean_t pi_created; struct phyint_group *pg; @@ -441,7 +624,6 @@ retry: pi = NULL; pg = NULL; pi_created = _B_FALSE; - pg_created = _B_FALSE; if (debug & D_PHYINT) { logdebug("phyint_inst_init_from_k(%s %s)\n", @@ -454,11 +636,11 @@ retry: ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6; /* - * Get the interface flags. Ignore loopback and multipoint - * interfaces. + * Get the interface flags. Ignore virtual interfaces, IPMP + * meta-interfaces, point-to-point interfaces, and interfaces + * that can't support multicast. */ - (void) strncpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; + (void) strlcpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name)); if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) { logperror("phyint_inst_init_from_k:" @@ -467,7 +649,8 @@ retry: return (NULL); } flags = lifr.lifr_flags; - if (!(flags & IFF_MULTICAST) || (flags & IFF_LOOPBACK)) + if (!(flags & IFF_MULTICAST) || + (flags & (IFF_VIRTUAL|IFF_IPMP|IFF_POINTOPOINT))) return (NULL); /* @@ -493,8 +676,7 @@ retry: } return (NULL); } - (void) strncpy(pg_name, lifr.lifr_groupname, sizeof (pg_name)); - pg_name[sizeof (pg_name) - 1] = '\0'; + (void) strlcpy(pg_name, lifr.lifr_groupname, sizeof (pg_name)); /* * If the phyint is not part of any group, pg_name is the @@ -503,12 +685,13 @@ retry: */ if (pg_name[0] == '\0' && !track_all_phyints) { /* - * If the IFF_FAILED or IFF_OFFLINE flags are set, reset - * them. These flags shouldn't be set if IPMP isn't - * tracking the interface. + * If the IFF_FAILED, IFF_INACTIVE, or IFF_OFFLINE flags are + * set, reset them. 
These flags shouldn't be set if in.mpathd + * isn't tracking the interface. */ - if ((flags & (IFF_FAILED | IFF_OFFLINE)) != 0) { - lifr.lifr_flags = flags & ~(IFF_FAILED | IFF_OFFLINE); + if ((flags & (IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE))) { + lifr.lifr_flags = flags & + ~(IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE); if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) { logperror("phyint_inst_init_from_k:" @@ -520,21 +703,20 @@ retry: } /* - * We need to create a new phyint instance. A phyint instance - * belongs to a phyint, and the phyint belongs to a phyint group. - * So we first lookup the 'parents' and if they don't exist then - * we create them. + * We need to create a new phyint instance. We may also need to + * create the group if e.g. the SIOCGLIFCONF loop in initifs() found + * an underlying interface before it found its IPMP meta-interface. + * Note that we keep any created groups even if phyint_inst_from_k() + * fails since a group's existence is not dependent on the ability of + * in.mpathd to the track the group's interfaces. */ - pg = phyint_group_lookup(pg_name); - if (pg == NULL) { - pg = phyint_group_create(pg_name); - if (pg == NULL) { - logerr("phyint_inst_init_from_k:" - " unable to create group %s\n", pg_name); + if ((pg = phyint_group_lookup(pg_name)) == NULL) { + if ((pg = phyint_group_create(pg_name)) == NULL) { + logerr("phyint_inst_init_from_k: cannot create group " + "%s\n", pg_name); return (NULL); } phyint_group_insert(pg); - pg_created = _B_TRUE; } /* @@ -546,8 +728,6 @@ retry: if (pi == NULL) { logerr("phyint_inst_init_from_k:" " unable to create phyint %s\n", pi_name); - if (pg_created) - phyint_group_delete(pg); return (NULL); } pi_created = _B_TRUE; @@ -564,8 +744,6 @@ retry: * while we are yet to update our tables. Do it now. 
*/ if (pi->pi_ifindex != ifindex) { - if (pg_created) - phyint_group_delete(pg); phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af))); goto retry; } @@ -577,9 +755,6 @@ retry: * changed, while we are yet to update our tables. Do it now. */ if (strcmp(pi->pi_group->pg_name, pg_name) != 0) { - if (pg_created) - phyint_group_delete(pg); - restore_phyint(pi); phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af))); goto retry; @@ -594,16 +769,25 @@ retry: if (pii == NULL) { logerr("phyint_inst_init_from_k: unable to create" "phyint inst %s\n", pi->pi_name); - if (pi_created) { - /* - * Deleting the phyint will delete the phyint group - * if this is the last phyint in the group. - */ + if (pi_created) phyint_delete(pi); - } + return (NULL); } + if (pi_created) { + /* + * If this phyint does not have a unique hardware address in its + * group, offline it. (The change_pif_flags() implementation + * requires that we defer this until after the phyint_instance + * is created.) + */ + if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) { + pi->pi_hwaddrdup = _B_TRUE; + (void) phyint_offline(pi, 0); + } + } + return (pii); } @@ -677,16 +861,16 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) { icmp6_filter_t filter; int hopcount = 1; - int int_op; + int off = 0; + int on = 1; struct sockaddr_in6 testaddr; /* * Open a raw socket with ICMPv6 protocol. * - * Use IPV6_DONTFAILOVER_IF to make sure that probes go out - * on the specified phyint only, and are not subject to load - * balancing. Bind to the src address chosen will ensure that - * the responses are received only on the specified phyint. + * Use IPV6_BOUND_IF to make sure that probes are sent and received on + * the specified phyint only. Bind to the test address to ensure that + * the responses are sent to the specified phyint. * * Set the hopcount to 1 so that probe packets are not routed. * Disable multicast loopback. 
Set the receive filter to @@ -696,7 +880,7 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) if (pii->pii_probe_sock < 0) { logperror_pii(pii, "phyint_inst_v6_sockinit: socket"); return (_B_FALSE); -} + } bzero(&testaddr, sizeof (testaddr)); testaddr.sin6_family = AF_INET6; @@ -709,14 +893,17 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - /* - * IPV6_DONTFAILOVER_IF option takes precedence over setting - * IP_MULTICAST_IF. So we don't set IPV6_MULTICAST_IF again. - */ - if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_DONTFAILOVER_IF, + if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_IF, (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) { logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" - " IPV6_DONTFAILOVER_IF"); + " IPV6_MULTICAST_IF"); + return (_B_FALSE); + } + + if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_BOUND_IF, + &pii->pii_ifindex, sizeof (uint_t)) < 0) { + logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" + " IPV6_BOUND_IF"); return (_B_FALSE); } @@ -734,9 +921,8 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - int_op = 0; /* used to turn off option */ if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, - (char *)&int_op, sizeof (int_op)) < 0) { + (char *)&off, sizeof (off)) < 0) { logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" " IPV6_MULTICAST_LOOP"); return (_B_FALSE); @@ -755,15 +941,22 @@ phyint_inst_v6_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - /* Enable receipt of ancillary data */ - int_op = 1; + /* Enable receipt of hoplimit */ if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT, - (char *)&int_op, sizeof (int_op)) < 0) { + &on, sizeof (on)) < 0) { logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" " IPV6_RECVHOPLIMIT"); return (_B_FALSE); } + /* Enable receipt of timestamp */ + if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, + &on, sizeof (on)) < 0) { 
+ logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" + " SO_TIMESTAMP"); + return (_B_FALSE); + } + return (_B_TRUE); } @@ -775,20 +968,20 @@ static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii) { struct sockaddr_in testaddr; - char char_op; + char char_off = 0; int ttl = 1; char char_ttl = 1; + int on = 1; /* * Open a raw socket with ICMPv4 protocol. * - * Use IP_DONTFAILOVER_IF to make sure that probes go out - * on the specified phyint only, and are not subject to load - * balancing. Bind to the src address chosen will ensure that - * the responses are received only on the specified phyint. + * Use IP_BOUND_IF to make sure that probes are sent and received on + * the specified phyint only. Bind to the test address to ensure that + * the responses are sent to the specified phyint. * * Set the ttl to 1 so that probe packets are not routed. - * Disable multicast loopback. + * Disable multicast loopback. Enable receipt of timestamp. */ pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP); if (pii->pii_probe_sock < 0) { @@ -808,14 +1001,17 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - /* - * IP_DONTFAILOVER_IF option takes precedence over setting - * IP_MULTICAST_IF. So we don't set IP_MULTICAST_IF again. 
- */ - if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_DONTFAILOVER_IF, + if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_BOUND_IF, + &pii->pii_ifindex, sizeof (uint_t)) < 0) { + logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" + " IP_BOUND_IF"); + return (_B_FALSE); + } + + if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_IF, (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) { logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" - " IP_DONTFAILOVER"); + " IP_MULTICAST_IF"); return (_B_FALSE); } @@ -826,9 +1022,8 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii) return (_B_FALSE); } - char_op = 0; /* used to turn off option */ if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP, - (char *)&char_op, sizeof (char_op)) == -1) { + (char *)&char_off, sizeof (char_off)) == -1) { logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" " IP_MULTICAST_LOOP"); return (_B_FALSE); @@ -841,6 +1036,13 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii) return (_B_FALSE); } + if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, &on, + sizeof (on)) < 0) { + logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" + " SO_TIMESTAMP"); + return (_B_FALSE); + } + return (_B_TRUE); } @@ -848,7 +1050,7 @@ phyint_inst_v4_sockinit(struct phyint_instance *pii) * Remove the phyint group from the list of 'all phyint groups' * and free it. */ -static void +void phyint_group_delete(struct phyint_group *pg) { /* @@ -881,10 +1083,69 @@ phyint_group_delete(struct phyint_group *pg) phyint_grouplistsig++; (void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE); + addrlist_free(&pg->pg_addrs); free(pg); } /* + * Refresh the state of `pg' based on its current members. + */ +void +phyint_group_refresh_state(struct phyint_group *pg) +{ + enum pg_state state; + enum pg_state origstate = pg->pg_state; + struct phyint *pi, *usablepi; + uint_t nif = 0, nusable = 0; + + /* + * Anonymous groups never change state. 
+ */ + if (pg == phyint_anongroup) + return; + + for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { + nif++; + if (phyint_is_usable(pi)) { + nusable++; + usablepi = pi; + } + } + + if (nusable == 0) + state = PG_FAILED; + else if (nif == nusable) + state = PG_OK; + else + state = PG_DEGRADED; + + phyint_group_chstate(pg, state); + + /* + * If we're shutting down, skip logging messages since otherwise our + * shutdown housecleaning will make us report that groups are unusable. + */ + if (cleanup_started) + return; + + /* + * NOTE: We use pg_failmsg_printed rather than origstate since + * otherwise at startup we'll log a "now usable" message when the + * first usable phyint is added to an empty group. + */ + if (state != PG_FAILED && pg->pg_failmsg_printed) { + assert(origstate == PG_FAILED); + logerr("At least 1 IP interface (%s) in group %s is now " + "usable\n", usablepi->pi_name, pg->pg_name); + pg->pg_failmsg_printed = _B_FALSE; + } else if (origstate != PG_FAILED && state == PG_FAILED) { + logerr("All IP interfaces in group %s are now unusable\n", + pg->pg_name); + pg->pg_failmsg_printed = _B_TRUE; + } +} + +/* * Extract information from the kernel about the desired phyint. * Look only for properties of the phyint and not properties of logints. * Take appropriate action on the changes. @@ -998,28 +1259,16 @@ phyint_inst_update_from_k(struct phyint_instance *pii) if (pi->pi_v6 != NULL) pi->pi_v6->pii_flags = pi->pi_flags; + /* + * Make sure the IFF_FAILED flag is set if and only if we think + * the interface should be failed. + */ if (pi->pi_flags & IFF_FAILED) { - /* - * If we are in the running and full state, we have - * completed failbacks successfully and we would have - * expected IFF_FAILED to have been clear. That it is - * set means there was a race condition. Some other - * process turned on the IFF_FAILED flag. Since the - * flag setting is not atomic, i.e. 
a get ioctl followed - * by a set ioctl, and since there is no way to set an - * individual flag bit, this could have occurred. - */ - if (pi->pi_state == PI_RUNNING && pi->pi_full) - (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); + if (pi->pi_state == PI_RUNNING) + (void) change_pif_flags(pi, 0, IFF_FAILED); } else { - /* - * If we are in the failed state, there was a race. - * we have completed failover successfully because our - * state is failed and empty. Some other process turned - * off the IFF_FAILED flag. Same comment as above - */ - if (pi->pi_state == PI_FAILED && pi->pi_empty) - (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); + if (pi->pi_state == PI_FAILED) + (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE); } /* No change in phyint status */ @@ -1028,12 +1277,12 @@ phyint_inst_update_from_k(struct phyint_instance *pii) /* * Delete the phyint. Remove it from the list of all phyints, and the - * list of phyint group members. If the group becomes empty, delete the - * group also. + * list of phyint group members. */ static void phyint_delete(struct phyint *pi) { + struct phyint *pi2; struct phyint_group *pg = pi->pi_group; if (debug & D_PHYINT) @@ -1065,6 +1314,9 @@ phyint_delete(struct phyint *pi) pi->pi_pgnext = NULL; pi->pi_pgprev = NULL; + /* Refresh the group state now that this phyint has been removed */ + phyint_group_refresh_state(pg); + /* Remove the phyint from the global list of phyints */ if (pi->pi_prev == NULL) { /* Phyint is the 1st in the list */ @@ -1077,11 +1329,153 @@ phyint_delete(struct phyint *pi) pi->pi_next = NULL; pi->pi_prev = NULL; + /* + * See if another phyint in the group had been offlined because + * it was a dup of `pi' -- and if so, online it. 
+ */ + if (!pi->pi_hwaddrdup && + (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) { + assert(pi2->pi_hwaddrdup); + (void) phyint_undo_offline(pi2); + } + phyint_link_close(pi); free(pi); +} + +/* + * Offline phyint `pi' if at least `minred' usable interfaces remain in the + * group. Returns an IPMP error code. + */ +int +phyint_offline(struct phyint *pi, uint_t minred) +{ + unsigned int nusable = 0; + struct phyint *pi2; + struct phyint_group *pg = pi->pi_group; + + /* + * Verify that enough usable interfaces in the group would remain. + * As a special case, if the group has failed, allow any non-offline + * phyints to be offlined. + */ + if (pg != phyint_anongroup) { + for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { + if (pi2 == pi) + continue; + if (phyint_is_usable(pi2) || + (GROUP_FAILED(pg) && pi2->pi_state != PI_OFFLINE)) + nusable++; + } + } + if (nusable < minred) + return (IPMP_EMINRED); + + if (!change_pif_flags(pi, IFF_OFFLINE, 0)) + return (IPMP_FAILURE); + + /* + * The interface is now offline, so stop probing it. Note that + * if_mpadm(1M) will down the test addresses, after receiving a + * success reply from us. The routing socket message will then make us + * close the socket used for sending probes. But it is more logical + * that an offlined interface must not be probed, even if it has test + * addresses. + * + * NOTE: stop_probing() also sets PI_OFFLINE. + */ + stop_probing(pi); + + /* + * If we're offlining the phyint because it has a duplicate hardware + * address, print a warning -- and leave the link open so that we can + * be notified of hardware address changes that make it usable again. + * Otherwise, close the link so that we won't prevent a detach. 
+ */ + if (pi->pi_hwaddrdup) { + logerr("IP interface %s has a hardware address which is not " + "unique in group %s; offlining\n", pi->pi_name, + pg->pg_name); + } else { + phyint_link_close(pi); + } + + /* + * If this phyint was preventing another phyint with a duplicate + * hardware address from being online, bring that one online now. + */ + if (!pi->pi_hwaddrdup && + (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) { + assert(pi2->pi_hwaddrdup); + (void) phyint_undo_offline(pi2); + } - /* Delete the phyint_group if the last phyint has been deleted */ - if (pg->pg_phyint == NULL) - phyint_group_delete(pg); + /* + * If this interface was active, try to activate another INACTIVE + * interface in the group. + */ + if (!(pi->pi_flags & IFF_INACTIVE)) + phyint_activate_another(pi); + + return (IPMP_SUCCESS); +} + +/* + * Undo a previous offline of `pi'. Returns an IPMP error code. + */ +int +phyint_undo_offline(struct phyint *pi) +{ + if (pi->pi_state != PI_OFFLINE) { + errno = EINVAL; + return (IPMP_FAILURE); + } + + /* + * If necessary, reinitialize our link information and verify that its + * hardware address is still unique across the group. + */ + if (pi->pi_dh == NULL && !phyint_link_init(pi)) { + errno = EIO; + return (IPMP_FAILURE); + } + + if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) { + pi->pi_hwaddrdup = _B_TRUE; + return (IPMP_EHWADDRDUP); + } + + if (pi->pi_hwaddrdup) { + logerr("IP interface %s now has a unique hardware address in " + "group %s; onlining\n", pi->pi_name, pi->pi_group->pg_name); + pi->pi_hwaddrdup = _B_FALSE; + } + + if (!change_pif_flags(pi, 0, IFF_OFFLINE)) + return (IPMP_FAILURE); + + /* + * While the interface was offline, it may have failed (e.g. the link + * may have gone down). phyint_inst_check_for_failure() will have + * already set pi_flags with IFF_FAILED, so we can use that to decide + * whether the phyint should transition to running. 
Note that after + * we transition to running, we will start sending probes again (if + * test addresses are configured), which may also reveal that the + * interface is in fact failed. + */ + if (pi->pi_flags & IFF_FAILED) { + phyint_chstate(pi, PI_FAILED); + } else { + /* calls phyint_chstate() */ + phyint_transition_to_running(pi); + } + + /* + * Give the requestor time to configure test addresses before + * complaining that they're missing. + */ + pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME; + + return (IPMP_SUCCESS); } /* @@ -1166,11 +1560,10 @@ phyint_inst_print(struct phyint_instance *pii) } logdebug("\nPhyint instance: %s %s index %u state %x flags %llx " - "sock %x in_use %d empty %x full %x\n", + "sock %x in_use %d\n", AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex, pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock, - pii->pii_in_use, pii->pii_phyint->pi_empty, - pii->pii_phyint->pi_full); + pii->pii_in_use); for (li = pii->pii_logint; li != NULL; li = li->li_next) logint_print(li); @@ -1211,9 +1604,11 @@ phyint_inst_print(struct phyint_instance *pii) } else { logdebug("#%d target NULL ", i); } - logdebug("time_sent %u status %d time_ack/lost %u\n", - pii->pii_probes[i].pr_time_sent, + logdebug("time_start %lld status %d " + "time_ackproc %lld time_lost %u", + pii->pii_probes[i].pr_hrtime_start, pii->pii_probes[i].pr_status, + pii->pii_probes[i].pr_hrtime_ackproc, pii->pii_probes[i].pr_time_lost); i = PROBE_INDEX_PREV(i); } while (i != most_recent); @@ -1293,7 +1688,6 @@ logint_init_from_k(struct phyint_instance *pii, char *li_name) struct logint *li; struct lifreq lifr; struct in6_addr test_subnet; - struct in6_addr test_subnet_mask; struct in6_addr testaddr; int test_subnet_len; struct sockaddr_in6 *sin6; @@ -1373,55 +1767,21 @@ logint_init_from_k(struct phyint_instance *pii, char *li_name) testaddr = sin6->sin6_addr; } - if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) { - ptp = _B_TRUE; - if (ioctl(ifsock, 
SIOCGLIFDSTADDR, (char *)&lifr) < 0) { - if (errno != ENXIO) { - logperror_li(li, "logint_init_from_k:" - " (get dstaddr)"); - } - goto error; - } - if (pii->pii_af == AF_INET) { - sin = (struct sockaddr_in *)&lifr.lifr_addr; - IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &tgaddr); - } else { - sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; - tgaddr = sin6->sin6_addr; - } - } else { - if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) { - /* Interface may have vanished */ - if (errno != ENXIO) { - logperror_li(li, "logint_init_from_k:" - " (get subnet)"); - } - goto error; - } - if (lifr.lifr_subnet.ss_family == AF_INET6) { - sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet; - test_subnet = sin6->sin6_addr; - test_subnet_len = lifr.lifr_addrlen; - } else { - sin = (struct sockaddr_in *)&lifr.lifr_subnet; - IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet); - test_subnet_len = lifr.lifr_addrlen + - (IPV6_ABITS - IP_ABITS); - } - (void) ip_index_to_mask_v6(test_subnet_len, &test_subnet_mask); - } - - /* - * Also record the OINDEX for completeness. This information is - * not used. 
- */ - if (ioctl(ifsock, SIOCGLIFOINDEX, (char *)&lifr) < 0) { - if (errno != ENXIO) { - logperror_li(li, "logint_init_from_k:" - " (get lifoindex)"); - } + if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) { + /* Interface may have vanished */ + if (errno != ENXIO) + logperror_li(li, "logint_init_from_k: (get subnet)"); goto error; } + if (lifr.lifr_subnet.ss_family == AF_INET6) { + sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet; + test_subnet = sin6->sin6_addr; + test_subnet_len = lifr.lifr_addrlen; + } else { + sin = (struct sockaddr_in *)&lifr.lifr_subnet; + IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet); + test_subnet_len = lifr.lifr_addrlen + (IPV6_ABITS - IP_ABITS); + } /* * If this is the logint corresponding to the test address used for @@ -1454,7 +1814,6 @@ logint_init_from_k(struct phyint_instance *pii, char *li_name) /* Update the logint with the values obtained from the kernel. */ li->li_addr = testaddr; li->li_in_use = 1; - li->li_oifindex = lifr.lifr_index; if (ptp) { li->li_dstaddr = tgaddr; li->li_subnet_len = (pii->pii_af == AF_INET) ? @@ -1530,15 +1889,12 @@ static void logint_print(struct logint *li) { char abuf[INET6_ADDRSTRLEN]; - int af; - - af = li->li_phyint_inst->pii_af; + int af = li->li_phyint_inst->pii_af; logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name, pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len); - logdebug("\tFlags: %llx in_use %d oifindex %d\n", - li->li_flags, li->li_in_use, li->li_oifindex); + logdebug("\tFlags: %llx in_use %d\n", li->li_flags, li->li_in_use); } char * @@ -1555,6 +1911,33 @@ pr_addr(int af, struct in6_addr addr, char *abuf, int len) return (abuf); } +/* + * Fill in the sockaddr_storage pointed to by `ssp' with the IP address + * represented by the [`af',`addr'] pair. Needed because in.mpathd internally + * stores all addresses as in6_addrs, but we don't want to expose that. 
+ */ +void +addr2storage(int af, const struct in6_addr *addr, struct sockaddr_storage *ssp) +{ + struct sockaddr_in *sinp = (struct sockaddr_in *)ssp; + struct sockaddr_in6 *sin6p = (struct sockaddr_in6 *)ssp; + + assert(af == AF_INET || af == AF_INET6); + + switch (af) { + case AF_INET: + (void) memset(sinp, 0, sizeof (*sinp)); + sinp->sin_family = AF_INET; + IN6_V4MAPPED_TO_INADDR(addr, &sinp->sin_addr); + break; + case AF_INET6: + (void) memset(sin6p, 0, sizeof (*sin6p)); + sin6p->sin6_family = AF_INET6; + sin6p->sin6_addr = *addr; + break; + } +} + /* Lookup target on its address */ struct target * target_lookup(struct phyint_instance *pii, struct in6_addr addr) @@ -1686,7 +2069,7 @@ target_select_best(struct phyint_instance *pii) if (tg->tg_latime + MIN_RECOVERY_TIME < now) { slow_recovered = tg; /* - * Promote the slow_recoverd to unused + * Promote the slow_recovered to unused */ tg->tg_status = TG_UNUSED; } else { @@ -1698,7 +2081,7 @@ target_select_best(struct phyint_instance *pii) if (tg->tg_latime + MIN_RECOVERY_TIME < now) { dead_recovered = tg; /* - * Promote the dead_recoverd to slow + * Promote the dead_recovered to slow */ tg->tg_status = TG_SLOW; tg->tg_latime = now; @@ -1798,11 +2181,9 @@ target_create(struct phyint_instance *pii, struct in6_addr addr, /* * If there are multiple subnets associated with an interface, then - * add the target to this phyint instance, only if it belongs to the - * same subnet as the test address. The reason is that interface - * routes derived from non-test-addresses i.e. non-IFF_NOFAILOVER - * addresses, will disappear after failover, and the targets will not - * be reachable from this interface. + * add the target to this phyint instance only if it belongs to the + * same subnet as the test address. This assures us that we will + * be able to reach this target through our routing table. 
*/ if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len)) return; @@ -1906,11 +2287,12 @@ target_add(struct phyint_instance *pii, struct in6_addr addr, /* * If the target does not exist, create it; target_create() will set - * tg_in_use to true. If it exists already, and it is a router - * target, set tg_in_use to to true, so that init_router_targets() - * won't delete it + * tg_in_use to true. Even if it exists already, if it's a router + * target and we'd previously learned of it through multicast, then we + * need to recreate it as a router target. Otherwise, just set + * tg_in_use to to true so that init_router_targets() won't delete it. */ - if (tg == NULL) + if (tg == NULL || (is_router && !pii->pii_targets_are_routers)) target_create(pii, addr, is_router); else if (is_router) tg->tg_in_use = 1; @@ -2034,16 +2416,17 @@ target_delete(struct target *tg) * relevant any longer. */ assert(pii->pii_targets == NULL); + pii->pii_targets_are_routers = _B_FALSE; clear_pii_probe_stats(pii); pii_other = phyint_inst_other(pii); /* - * If there are no targets on both instances and the interface is - * online, go back to PI_NOTARGETS state, since we cannot probe this - * phyint any more. For more details, please see phyint state - * diagram in mpd_probe.c. + * If there are no targets on both instances and the interface would + * otherwise be considered PI_RUNNING, go back to PI_NOTARGETS state, + * since we cannot probe this phyint any more. For more details, + * please see phyint state diagram in mpd_probe.c. 
*/ - if (!PROBE_CAPABLE(pii_other) && + if (!PROBE_CAPABLE(pii_other) && LINK_UP(pii->pii_phyint) && pii->pii_phyint->pi_state != PI_OFFLINE) phyint_chstate(pii->pii_phyint, PI_NOTARGETS); } @@ -2101,9 +2484,11 @@ reset_pii_probes(struct phyint_instance *pii, struct target *tg) for (i = 0; i < PROBE_STATS_COUNT; i++) { if (pii->pii_probes[i].pr_target == tg) { + if (pii->pii_probes[i].pr_status == PR_UNACKED) { + probe_chstate(&pii->pii_probes[i], pii, + PR_LOST); + } pii->pii_probes[i].pr_target = NULL; - if (pii->pii_probes[i].pr_status == PR_UNACKED) - pii->pii_probes[i].pr_status = PR_LOST; } } @@ -2132,7 +2517,7 @@ target_print(struct target *tg) af = tg->tg_phyint_inst->pii_af; logdebug("Target on %s %s addr %s\n" - "status %d rtt_sa %d rtt_sd %d crtt %d tg_in_use %d\n", + "status %d rtt_sa %lld rtt_sd %lld crtt %d tg_in_use %d\n", AF_STR(af), tg->tg_phyint_inst->pii_name, pr_addr(af, tg->tg_address, abuf, sizeof (abuf)), tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd, @@ -2158,35 +2543,16 @@ phyint_inst_print_all(void) } /* - * Convert length for a mask to the mask. - */ -static void -ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask) -{ - int j; - - assert(masklen <= IPV6_ABITS); - bzero((char *)bitmask, sizeof (*bitmask)); - - /* Make the 'masklen' leftmost bits one */ - for (j = 0; masklen > 8; masklen -= 8, j++) - bitmask->s6_addr[j] = 0xff; - - bitmask->s6_addr[j] = 0xff << (8 - masklen); - -} - -/* * Compare two prefixes that have the same prefix length. * Fails if the prefix length is unreasonable. 
*/ -static boolean_t -prefix_equal(struct in6_addr p1, struct in6_addr p2, int prefix_len) +boolean_t +prefix_equal(struct in6_addr p1, struct in6_addr p2, uint_t prefix_len) { uchar_t mask; int j; - if (prefix_len < 0 || prefix_len > IPV6_ABITS) + if (prefix_len > IPV6_ABITS) return (_B_FALSE); for (j = 0; prefix_len > 8; prefix_len -= 8, j++) @@ -2202,35 +2568,25 @@ prefix_equal(struct in6_addr p1, struct in6_addr p2, int prefix_len) } /* - * Get the number of UP logints (excluding IFF_NOFAILOVERs), on both - * IPv4 and IPv6 put together. The phyint with the least such number - * will be used as the failover destination, if no standby interface is - * available + * Get the number of UP logints on phyint `pi'. */ -int +static int logint_upcount(struct phyint *pi) { struct logint *li; - struct phyint_instance *pii; int count = 0; - pii = pi->pi_v4; - if (pii != NULL) { - for (li = pii->pii_logint; li != NULL; li = li->li_next) { - if ((li->li_flags & - (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) { + if (pi->pi_v4 != NULL) { + for (li = pi->pi_v4->pii_logint; li != NULL; li = li->li_next) { + if (li->li_flags & IFF_UP) count++; - } } } - pii = pi->pi_v6; - if (pii != NULL) { - for (li = pii->pii_logint; li != NULL; li = li->li_next) { - if ((li->li_flags & - (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) { + if (pi->pi_v6 != NULL) { + for (li = pi->pi_v6->pii_logint; li != NULL; li = li->li_next) { + if (li->li_flags & IFF_UP) count++; - } } } @@ -2250,6 +2606,28 @@ phyint_inst_other(struct phyint_instance *pii) } /* + * Check whether a phyint is functioning. + */ +static boolean_t +phyint_is_functioning(struct phyint *pi) +{ + if (pi->pi_state == PI_RUNNING) + return (_B_TRUE); + return (pi->pi_state == PI_NOTARGETS && !(pi->pi_flags & IFF_FAILED)); +} + +/* + * Check whether a phyint is usable. 
+ */ +static boolean_t +phyint_is_usable(struct phyint *pi) +{ + if (logint_upcount(pi) == 0) + return (_B_FALSE); + return (phyint_is_functioning(pi)); +} + +/* * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'. * Before sending the event, it prepends the current version of the IPMP * sysevent API. Returns 0 on success, -1 on failure (in either case, @@ -2258,16 +2636,18 @@ phyint_inst_other(struct phyint_instance *pii) static int post_event(const char *subclass, nvlist_t *nvl) { - sysevent_id_t eid; + static evchan_t *evchp = NULL; /* - * Since sysevents don't work yet in non-global zones, there cannot - * possibly be any consumers yet, so don't bother trying to generate - * them. (Otherwise, we'll spew warnings.) + * Initialize the event channel if we haven't already done so. */ - if (getzoneid() != GLOBAL_ZONEID) { - nvlist_free(nvl); - return (0); + if (evchp == NULL) { + errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evchp, EVCH_CREAT); + if (errno != 0) { + logerr("cannot create event channel `%s': %s\n", + IPMP_EVENT_CHAN, strerror(errno)); + goto failed; + } } errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION, @@ -2278,8 +2658,9 @@ post_event(const char *subclass, nvlist_t *nvl) goto failed; } - if (sysevent_post_event(EC_IPMP, (char *)subclass, SUNW_VENDOR, - "in.mpathd", nvl, &eid) == -1) { + errno = sysevent_evc_publish(evchp, EC_IPMP, subclass, "com.sun", + "in.mpathd", nvl, EVCH_NOSLEEP); + if (errno != 0) { logerr("cannot send `%s' event: %s\n", subclass, strerror(errno)); goto failed; @@ -2300,6 +2681,8 @@ ifstate(struct phyint *pi) { switch (pi->pi_state) { case PI_NOTARGETS: + if (pi->pi_flags & IFF_FAILED) + return (IPMP_IF_FAILED); return (IPMP_IF_UNKNOWN); case PI_OFFLINE: @@ -2330,12 +2713,203 @@ iftype(struct phyint *pi) } /* + * Return the external IPMP link state associated with phyint `pi'. 
+ */ +static ipmp_if_linkstate_t +iflinkstate(struct phyint *pi) +{ + if (!(pi->pi_notes & (DL_NOTE_LINK_UP|DL_NOTE_LINK_DOWN))) + return (IPMP_LINK_UNKNOWN); + + return (LINK_DOWN(pi) ? IPMP_LINK_DOWN : IPMP_LINK_UP); +} + +/* + * Return the external IPMP probe state associated with phyint `pi'. + */ +static ipmp_if_probestate_t +ifprobestate(struct phyint *pi) +{ + if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) + return (IPMP_PROBE_DISABLED); + + if (pi->pi_state == PI_FAILED) + return (IPMP_PROBE_FAILED); + + if (!PROBE_CAPABLE(pi->pi_v4) && !PROBE_CAPABLE(pi->pi_v6)) + return (IPMP_PROBE_UNKNOWN); + + return (IPMP_PROBE_OK); +} + +/* + * Return the external IPMP target mode associated with phyint instance `pii'. + */ +static ipmp_if_targmode_t +iftargmode(struct phyint_instance *pii) +{ + if (!PROBE_ENABLED(pii)) + return (IPMP_TARG_DISABLED); + else if (pii->pii_targets_are_routers) + return (IPMP_TARG_ROUTES); + else + return (IPMP_TARG_MULTICAST); +} + +/* + * Return the external IPMP flags associated with phyint `pi'. + */ +static ipmp_if_flags_t +ifflags(struct phyint *pi) +{ + ipmp_if_flags_t flags = 0; + + if (logint_upcount(pi) == 0) + flags |= IPMP_IFFLAG_DOWN; + if (pi->pi_flags & IFF_INACTIVE) + flags |= IPMP_IFFLAG_INACTIVE; + if (pi->pi_hwaddrdup) + flags |= IPMP_IFFLAG_HWADDRDUP; + if (phyint_is_functioning(pi) && flags == 0) + flags |= IPMP_IFFLAG_ACTIVE; + + return (flags); +} + +/* + * Store the test address used on phyint instance `pii' in `ssp'. If there's + * no test address, 0.0.0.0 is stored. + */ +static struct sockaddr_storage * +iftestaddr(struct phyint_instance *pii, struct sockaddr_storage *ssp) +{ + if (PROBE_ENABLED(pii)) + addr2storage(pii->pii_af, &pii->pii_probe_logint->li_addr, ssp); + else + addr2storage(AF_INET6, &in6addr_any, ssp); + + return (ssp); +} + +/* * Return the external IPMP group state associated with phyint group `pg'. 
*/ static ipmp_group_state_t groupstate(struct phyint_group *pg) { - return (GROUP_FAILED(pg) ? IPMP_GROUP_FAILED : IPMP_GROUP_OK); + switch (pg->pg_state) { + case PG_FAILED: + return (IPMP_GROUP_FAILED); + case PG_DEGRADED: + return (IPMP_GROUP_DEGRADED); + case PG_OK: + return (IPMP_GROUP_OK); + } + + logerr("groupstate: unknown state %d; aborting\n", pg->pg_state); + abort(); + /* NOTREACHED */ +} + +/* + * Return the external IPMP probe state associated with probe `ps'. + */ +static ipmp_probe_state_t +probestate(struct probe_stats *ps) +{ + switch (ps->pr_status) { + case PR_UNUSED: + case PR_LOST: + return (IPMP_PROBE_LOST); + case PR_UNACKED: + return (IPMP_PROBE_SENT); + case PR_ACKED: + return (IPMP_PROBE_ACKED); + } + + logerr("probestate: unknown state %d; aborting\n", ps->pr_status); + abort(); + /* NOTREACHED */ +} + +/* + * Generate an ESC_IPMP_PROBE_STATE sysevent for the probe described by `pr' + * on phyint instance `pii'. Returns 0 on success, -1 on failure. + */ +int +probe_state_event(struct probe_stats *pr, struct phyint_instance *pii) +{ + nvlist_t *nvl; + hrtime_t proc_time = 0, recv_time = 0; + struct sockaddr_storage ss; + struct target *tg = pr->pr_target; + + errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0); + if (errno != 0) { + logperror("cannot create `interface change' event"); + return (-1); + } + + errno = nvlist_add_uint32(nvl, IPMP_PROBE_ID, pr->pr_id); + if (errno != 0) + goto failed; + + errno = nvlist_add_string(nvl, IPMP_IF_NAME, pii->pii_phyint->pi_name); + if (errno != 0) + goto failed; + + errno = nvlist_add_uint32(nvl, IPMP_PROBE_STATE, probestate(pr)); + if (errno != 0) + goto failed; + + errno = nvlist_add_hrtime(nvl, IPMP_PROBE_START_TIME, + pr->pr_hrtime_start); + if (errno != 0) + goto failed; + + errno = nvlist_add_hrtime(nvl, IPMP_PROBE_SENT_TIME, + pr->pr_hrtime_sent); + if (errno != 0) + goto failed; + + if (pr->pr_status == PR_ACKED) { + recv_time = pr->pr_hrtime_ackrecv; + proc_time = pr->pr_hrtime_ackproc; + } + 
+ errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, recv_time); + if (errno != 0) + goto failed; + + errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, proc_time); + if (errno != 0) + goto failed; + + if (tg != NULL) + addr2storage(pii->pii_af, &tg->tg_address, &ss); + else + addr2storage(pii->pii_af, &in6addr_any, &ss); + + errno = nvlist_add_byte_array(nvl, IPMP_PROBE_TARGET, (uchar_t *)&ss, + sizeof (ss)); + if (errno != 0) + goto failed; + + errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTAVG, + tg->tg_rtt_sa / 8); + if (errno != 0) + goto failed; + + errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTDEV, + tg->tg_rtt_sd / 4); + if (errno != 0) + goto failed; + + return (post_event(ESC_IPMP_PROBE_STATE, nvl)); +failed: + logperror("cannot create `probe state' event"); + nvlist_free(nvl); + return (-1); } /* @@ -2529,10 +3103,15 @@ gensig(void) unsigned int getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp) { - struct phyint_group *pg; struct phyint *pi; + struct phyint_group *pg; char (*ifs)[LIFNAMSIZ]; - unsigned int nif, i; + unsigned int i, j; + unsigned int nif = 0, naddr = 0; + lifgroupinfo_t lifgr; + addrlist_t *addrp; + struct sockaddr_storage *addrs; + int fdt = 0; pg = phyint_group_lookup(grname); if (pg == NULL) @@ -2540,39 +3119,143 @@ getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp) /* * Tally up the number of interfaces, allocate an array to hold them, - * and insert their names into the array. + * and insert their names into the array. While we're at it, if any + * interface is actually enabled to send probes, save the group fdt. 
*/ - for (nif = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) + for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) nif++; ifs = alloca(nif * sizeof (*ifs)); for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) { assert(i < nif); (void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ); + if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) + fdt = pg->pg_fdt; } assert(i == nif); - *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, - groupstate(pg), nif, ifs); + /* + * If this is the anonymous group, there's no other information to + * collect (since there's no IPMP interface). + */ + if (pg == phyint_anongroup) { + *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt, + groupstate(pg), nif, ifs, "", "", "", "", 0, NULL); + return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); + } + + /* + * Grab some additional information about the group from the kernel. + * (NOTE: since SIOCGLIFGROUPINFO does not look up by interface name, + * we can use ifsock_v4 even for a V6-only group.) + */ + (void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ); + if (ioctl(ifsock_v4, SIOCGLIFGROUPINFO, &lifgr) == -1) { + if (errno == ENOENT) + return (IPMP_EUNKGROUP); + + logperror("getgroupinfo: SIOCGLIFGROUPINFO"); + return (IPMP_FAILURE); + } + + /* + * Tally up the number of data addresses, allocate an array to hold + * them, and insert their values into the array. + */ + for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) + naddr++; + + addrs = alloca(naddr * sizeof (*addrs)); + i = 0; + for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) { + /* + * It's possible to have duplicate addresses (if some are + * down). Weed the dups out to avoid confusing consumers. + * (If groups start having tons of addresses, we'll need a + * better algorithm here.) 
+ */ + for (j = 0; j < i; j++) { + if (sockaddrcmp(&addrs[j], &addrp->al_addr)) + break; + } + if (j == i) { + assert(i < naddr); + addrs[i++] = addrp->al_addr; + } + } + naddr = i; + + *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt, + groupstate(pg), nif, ifs, lifgr.gi_grifname, lifgr.gi_m4ifname, + lifgr.gi_m6ifname, lifgr.gi_bcifname, naddr, addrs); return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); } /* + * Store the target information associated with phyint instance `pii' into a + * dynamically allocated structure pointed to by `*targinfopp'. Returns an + * IPMP error code. + */ +unsigned int +gettarginfo(struct phyint_instance *pii, const char *name, + ipmp_targinfo_t **targinfopp) +{ + uint_t ntarg = 0; + struct target *tg; + struct sockaddr_storage ss; + struct sockaddr_storage *targs = NULL; + + if (PROBE_CAPABLE(pii)) { + targs = alloca(pii->pii_ntargets * sizeof (*targs)); + tg = pii->pii_target_next; + do { + if (tg->tg_status == TG_ACTIVE) { + assert(ntarg < pii->pii_ntargets); + addr2storage(pii->pii_af, &tg->tg_address, + &targs[ntarg++]); + } + if ((tg = tg->tg_next) == NULL) + tg = pii->pii_targets; + } while (tg != pii->pii_target_next); + + assert(ntarg == pii->pii_ntargets); + } + + *targinfopp = ipmp_targinfo_create(name, iftestaddr(pii, &ss), + iftargmode(pii), ntarg, targs); + return (*targinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); +} + +/* * Store the information associated with interface `ifname' into a dynamically * allocated structure pointed to by `*ifinfopp'. Returns an IPMP error code. 
*/ unsigned int getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp) { + int retval; struct phyint *pi; + ipmp_targinfo_t *targinfo4; + ipmp_targinfo_t *targinfo6; pi = phyint_lookup(ifname); if (pi == NULL) return (IPMP_EUNKIF); + if ((retval = gettarginfo(pi->pi_v4, pi->pi_name, &targinfo4)) != 0 || + (retval = gettarginfo(pi->pi_v6, pi->pi_name, &targinfo6)) != 0) + goto out; + *ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name, - ifstate(pi), iftype(pi)); - return (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); + ifstate(pi), iftype(pi), iflinkstate(pi), ifprobestate(pi), + ifflags(pi), targinfo4, targinfo6); + retval = (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); +out: + if (targinfo4 != NULL) + ipmp_freetarginfo(targinfo4); + if (targinfo6 != NULL) + ipmp_freetarginfo(targinfo6); + return (retval); } /* @@ -2605,6 +3288,54 @@ getgrouplist(ipmp_grouplist_t **grlistpp) } /* + * Store the address information for `ssp' (in group `grname') into a + * dynamically allocated structure pointed to by `*adinfopp'. Returns an IPMP + * error code. (We'd call this function getaddrinfo(), but it would conflict + * with getaddrinfo(3SOCKET)). + */ +unsigned int +getgraddrinfo(const char *grname, struct sockaddr_storage *ssp, + ipmp_addrinfo_t **adinfopp) +{ + int ifsock; + addrlist_t *addrp, *addrmatchp = NULL; + ipmp_addr_state_t state; + const char *binding = ""; + struct lifreq lifr; + struct phyint_group *pg; + + if ((pg = phyint_group_lookup(grname)) == NULL) + return (IPMP_EUNKADDR); + + /* + * Walk through the data addresses, and find a match. Note that since + * some of the addresses may be down, more than one may match. We + * prefer an up address (if one exists). 
+ */ + for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) { + if (sockaddrcmp(ssp, &addrp->al_addr)) { + addrmatchp = addrp; + if (addrmatchp->al_flags & IFF_UP) + break; + } + } + + if (addrmatchp == NULL) + return (IPMP_EUNKADDR); + + state = (addrmatchp->al_flags & IFF_UP) ? IPMP_ADDR_UP : IPMP_ADDR_DOWN; + if (state == IPMP_ADDR_UP) { + ifsock = (ssp->ss_family == AF_INET) ? ifsock_v4 : ifsock_v6; + (void) strlcpy(lifr.lifr_name, addrmatchp->al_name, LIFNAMSIZ); + if (ioctl(ifsock, SIOCGLIFBINDING, &lifr) >= 0) + binding = lifr.lifr_binding; + } + + *adinfopp = ipmp_addrinfo_create(ssp, state, pg->pg_name, binding); + return (*adinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); +} + +/* * Store a snapshot of the IPMP subsystem into a dynamically allocated * structure pointed to by `*snapp'. Returns an IPMP error code. */ @@ -2613,10 +3344,12 @@ getsnap(ipmp_snap_t **snapp) { ipmp_grouplist_t *grlistp; ipmp_groupinfo_t *grinfop; + ipmp_addrinfo_t *adinfop; + ipmp_addrlist_t *adlistp; ipmp_ifinfo_t *ifinfop; ipmp_snap_t *snap; struct phyint *pi; - unsigned int i; + unsigned int i, j; int retval; snap = ipmp_snap_create(); @@ -2627,26 +3360,37 @@ getsnap(ipmp_snap_t **snapp) * Add group list. */ retval = getgrouplist(&snap->sn_grlistp); - if (retval != IPMP_SUCCESS) { - ipmp_snap_free(snap); - return (retval); - } + if (retval != IPMP_SUCCESS) + goto failed; /* - * Add information for each group in the list. + * Add information for each group in the list, along with all of its + * data addresses. 
*/ grlistp = snap->sn_grlistp; for (i = 0; i < grlistp->gl_ngroup; i++) { retval = getgroupinfo(grlistp->gl_groups[i], &grinfop); - if (retval != IPMP_SUCCESS) { - ipmp_snap_free(snap); - return (retval); - } + if (retval != IPMP_SUCCESS) + goto failed; + retval = ipmp_snap_addgroupinfo(snap, grinfop); if (retval != IPMP_SUCCESS) { ipmp_freegroupinfo(grinfop); - ipmp_snap_free(snap); - return (retval); + goto failed; + } + + adlistp = grinfop->gr_adlistp; + for (j = 0; j < adlistp->al_naddr; j++) { + retval = getgraddrinfo(grinfop->gr_name, + &adlistp->al_addrs[j], &adinfop); + if (retval != IPMP_SUCCESS) + goto failed; + + retval = ipmp_snap_addaddrinfo(snap, adinfop); + if (retval != IPMP_SUCCESS) { + ipmp_freeaddrinfo(adinfop); + goto failed; + } } } @@ -2655,18 +3399,19 @@ getsnap(ipmp_snap_t **snapp) */ for (pi = phyints; pi != NULL; pi = pi->pi_next) { retval = getifinfo(pi->pi_name, &ifinfop); - if (retval != IPMP_SUCCESS) { - ipmp_snap_free(snap); - return (retval); - } + if (retval != IPMP_SUCCESS) + goto failed; + retval = ipmp_snap_addifinfo(snap, ifinfop); if (retval != IPMP_SUCCESS) { ipmp_freeifinfo(ifinfop); - ipmp_snap_free(snap); - return (retval); + goto failed; } } *snapp = snap; return (IPMP_SUCCESS); +failed: + ipmp_snap_free(snap); + return (retval); } diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h index e4be3ccb30..39da2c3f1b 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _MPD_TABLES_H #define _MPD_TABLES_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -47,20 +45,11 @@ extern "C" { * switch AND * (ii) share the same phyint group name. 
* Load spreading and failover occur across members of the same phyint group. - * phyint group members must be homogenous. i.e. if a phyint belonging to a + * phyint group members must be homogeneous. i.e. if a phyint belonging to a * phyint group has a IPv6 protocol instance, then all members of the phyint * group, must have IPv6 protocol instances. (struct phyint_group) */ -/* - * Parameter passed to try_failover(), indicating the type of failover - * that is requested. - */ -#define FAILOVER_NORMAL 1 /* Failover to another phyint */ - /* that is preferably a standby */ -#define FAILOVER_TO_NONSTANDBY 2 /* Failover to non-standby phyint */ -#define FAILOVER_TO_ANY 3 /* Failover to any available phyint */ - #define MAXDEFERREDRTT 1 /* Maximum number of deferred rtts */ /* @@ -79,15 +68,9 @@ extern "C" { #define PI_IOCTL_ERROR 4 /* Some ioctl error */ #define PI_GROUP_CHANGED 5 /* The phyint has changed group. */ -/* - * Though IFF_POINTOPOINT is a logint property, for the purpose of - * failover, we treat it as a phyint property. Note that we cannot failover - * individual logints. - */ #define PHYINT_FLAGS(flags) \ - (((flags) & (IFF_STANDBY | IFF_INACTIVE | IFF_FAILED | IFF_OFFLINE | \ - IFF_POINTOPOINT | IFF_RUNNING)) | (handle_link_notifications ? \ - 0 : IFF_RUNNING)) + (((flags) & (IFF_STANDBY | IFF_INACTIVE | IFF_FAILED | IFF_OFFLINE | \ + IFF_RUNNING)) | (handle_link_notifications ? 0 : IFF_RUNNING)) /* A Phyint can have up to 2 instances, the IPv4 and the IPv6 instance */ #define PHYINT_INSTANCE(pi, af) \ @@ -152,29 +135,32 @@ extern "C" { * Phyint group states; see below for the phyint group definition. 
*/ enum pg_state { - PG_RUNNING = 1, /* at least one interface in group is working */ - PG_FAILED = 2 /* group has failed completely */ + PG_OK = 1, /* all interfaces in the group are working */ + PG_DEGRADED, /* some interfaces in the group are unusable */ + PG_FAILED /* all interfaces in the group are unusable */ }; /* * Convenience macro to check if the whole group has failed. */ -#define GROUP_FAILED(pg) ((pg)->pg_groupfailed) +#define GROUP_FAILED(pg) ((pg)->pg_state == PG_FAILED) /* * A doubly linked list of all phyint groups in the system. * A phyint group is identified by its group name. */ struct phyint_group { - char pg_name[LIFNAMSIZ + 1]; /* Phyint group name */ + char pg_name[LIFGRNAMSIZ]; /* Phyint group name */ struct phyint *pg_phyint; /* List of phyints in this group */ struct phyint_group *pg_next; /* Next phyint group */ struct phyint_group *pg_prev; /* Prev phyint group */ - uint64_t pg_sig; /* Current signature of this group */ - int pg_probeint; /* Interval between probes */ - int pg_fdt; /* Time needed to detect failure */ - uint_t - pg_groupfailed : 1; /* The whole group has failed */ + uint64_t pg_sig; /* Current signature of this group */ + int pg_probeint; /* Interval between probes */ + int pg_fdt; /* Time needed to detect failure */ + enum pg_state pg_state; /* Current group state */ + boolean_t pg_in_use; /* To detect removed groups */ + struct addrlist *pg_addrs; /* Data addresses in this group */ + boolean_t pg_failmsg_printed; /* Group failure msg printed */ }; /* @@ -207,6 +193,11 @@ struct phyint { uint16_t pi_icmpid; /* icmp id in icmp echo request */ uint64_t pi_taddrthresh; /* time (in secs) to delay logging */ /* about missing test addresses */ + dlpi_handle_t pi_dh; /* DLPI handle to underlying link */ + uint_t pi_notes; /* enabled DLPI notifications */ + uchar_t pi_hwaddr[DLPI_PHYSADDR_MAX]; /* phyint's hw address */ + size_t pi_hwaddrlen; /* phyint's hw address length */ + /* * The pi_whenup array is a circular buffer of 
the most recent * times (in milliseconds since some arbitrary point of time in @@ -217,14 +208,12 @@ struct phyint { unsigned int pi_whendx; uint_t - pi_empty : 1, /* failover done, empty */ - pi_full : 1, /* failback done, full */ - /* More details in probe.c */ pi_taddrmsg_printed : 1, /* testaddr msg printed */ pi_duptaddrmsg_printed : 1, /* dup testaddr msg printed */ pi_cfgmsg_printed : 1, /* bad config msg printed */ pi_lfmsg_printed : 1, /* link-flapping msg printed */ - pi_link_state : 1; /* interface link state */ + pi_link_state : 1, /* interface link state */ + pi_hwaddrdup : 1; /* disabled due to dup hw address */ }; /* @@ -260,19 +249,19 @@ struct phyint_instance { uint64_t pii_flags; /* Phyint flags from kernel */ struct probe_stats { - struct target *pr_target; /* Probe Target */ - uint_t pr_time_sent; /* Time probe was sent */ + uint_t pr_id; /* Full ID of probe */ + struct target *pr_target; /* Probe Target */ + uint_t pr_time_lost; /* Time probe declared lost */ + struct timeval pr_tv_sent; /* Wall time probe was sent */ + hrtime_t pr_hrtime_start; /* hrtime probe op started */ + hrtime_t pr_hrtime_sent; /* hrtime probe was sent */ + hrtime_t pr_hrtime_ackrecv; /* hrtime probe ack received */ + hrtime_t pr_hrtime_ackproc; /* hrtime probe ack processed */ uint_t pr_status; /* probe status as below */ #define PR_UNUSED 0 /* Probe slot unused */ #define PR_UNACKED 1 /* Probe is unacknowledged */ #define PR_ACKED 2 /* Probe has been acknowledged */ #define PR_LOST 3 /* Probe is declared lost */ - union { - uint_t tl; /* time probe is declared lost */ - uint_t ta; /* time probe is acked */ - } prt; -#define pr_time_lost prt.tl -#define pr_time_acked prt.ta } pii_probes[PROBE_STATS_COUNT]; uint_t @@ -319,7 +308,6 @@ struct logint { struct in6_addr li_subnet; /* prefix / subnet */ uint_t li_subnet_len; /* prefix / subnet length */ uint64_t li_flags; /* IFF_* flags */ - uint_t li_oifindex; /* original ifindex (SIOCGLIFOINDEX) */ uint_t li_in_use : 1, /* 
flag to detect deleted logints */ li_dupaddr : 1; /* test address is not unique */ @@ -345,12 +333,12 @@ struct target { #define TG_DEAD 4 /* Target is not responding */ hrtime_t tg_latime; /* Target's last active time */ - int tg_rtt_sa; /* Scaled round trip time(RTT) avg. */ - int tg_rtt_sd; /* Scaled RTT deviation */ - int tg_crtt; /* Conservative RTT = A + 4D */ + int64_t tg_rtt_sa; /* Scaled RTT average (in ns) */ + int64_t tg_rtt_sd; /* Scaled RTT deviation (in ns) */ + int tg_crtt; /* Conservative RTT = A + 4D (in ms) */ uint32_t tg_in_use : 1; /* In use flag */ - int tg_deferred[MAXDEFERREDRTT + 1]; + int64_t tg_deferred[MAXDEFERREDRTT + 1]; /* Deferred rtt data points */ int tg_num_deferred; /* Number of deferred rtt data points */ @@ -393,19 +381,20 @@ struct probe_success_count struct probes_missed { uint_t pm_nprobes; /* Cumulative number of missed probes */ - uint_t pm_ntimes; /* Total number of occassions */ + uint_t pm_ntimes; /* Total number of occasions */ }; -struct local_addr -{ - struct in6_addr addr; - struct local_addr *next; -}; +typedef struct addrlist { + struct addrlist *al_next; /* next address */ + char al_name[LIFNAMSIZ]; /* address lif name */ + uint64_t al_flags; /* address flags */ + struct sockaddr_storage al_addr; /* address */ +} addrlist_t; /* * Globals */ -extern struct local_addr *laddr_list; +extern addrlist_t *localaddrs; /* List of all local addresses, including local zones */ extern struct phyint *phyints; /* List of all phyints */ extern struct phyint_group *phyint_groups; /* List of all phyint groups */ @@ -428,10 +417,19 @@ extern void phyint_inst_delete(struct phyint_instance *pii); extern uint_t phyint_inst_timer(struct phyint_instance *pii); extern boolean_t phyint_inst_sockinit(struct phyint_instance *pii); -extern void phyint_newtype(struct phyint *pi); +extern void phyint_changed(struct phyint *pi); extern void phyint_chstate(struct phyint *pi, enum pi_state state); extern void phyint_group_chstate(struct 
phyint_group *pg, enum pg_state state); +extern struct phyint_group *phyint_group_create(const char *pg_name); +extern struct phyint_group *phyint_group_lookup(const char *pg_name); +extern void phyint_group_insert(struct phyint_group *pg); +extern void phyint_group_delete(struct phyint_group *pg); +extern void phyint_group_refresh_state(struct phyint_group *pg); extern void phyint_check_for_repair(struct phyint *pi); +extern void phyint_transition_to_running(struct phyint *pi); +extern void phyint_activate_another(struct phyint *pi); +extern int phyint_offline(struct phyint *pi, unsigned int); +extern int phyint_undo_offline(struct phyint *pi); extern void logint_init_from_k(struct phyint_instance *pii, char *li_name); extern void logint_delete(struct logint *li); @@ -448,34 +446,40 @@ extern void target_add(struct phyint_instance *pii, struct in6_addr addr, extern void in_data(struct phyint_instance *pii); extern void in6_data(struct phyint_instance *pii); -extern int try_failover(struct phyint *pi, int failover_type); -extern int try_failback(struct phyint *pi); -extern int do_failback(struct phyint *pi); -extern boolean_t change_lif_flags(struct phyint *pi, uint64_t flags, - boolean_t setfl); - extern void logperror_pii(struct phyint_instance *pii, const char *str); extern void logperror_li(struct logint *li, const char *str); extern char *pr_addr(int af, struct in6_addr addr, char *abuf, int len); +extern void addr2storage(int af, const struct in6_addr *addr, + struct sockaddr_storage *ssp); extern void phyint_inst_print_all(void); +extern boolean_t prefix_equal(struct in6_addr, struct in6_addr, uint_t); -extern int logint_upcount(struct phyint *pi); -extern void restore_phyint(struct phyint *pi); extern void reset_crtt_all(struct phyint *pi); extern int failure_state(struct phyint_instance *pii); extern void process_link_state_changes(void); extern void clear_pii_probe_stats(struct phyint_instance *pii); extern void start_timer(struct phyint_instance *pii); 
+extern void stop_probing(struct phyint *pi); extern boolean_t own_address(struct in6_addr addr); +extern boolean_t change_pif_flags(struct phyint *pi, uint64_t set, + uint64_t clear); extern void close_probe_socket(struct phyint_instance *pii, boolean_t flag); +extern int probe_state_event(struct probe_stats *, struct phyint_instance *); +extern void probe_chstate(struct probe_stats *, struct phyint_instance *, int); +extern unsigned int getgraddrinfo(const char *, struct sockaddr_storage *, + ipmp_addrinfo_t **); extern unsigned int getifinfo(const char *, ipmp_ifinfo_t **); extern unsigned int getgroupinfo(const char *, ipmp_groupinfo_t **); extern unsigned int getgrouplist(ipmp_grouplist_t **); extern unsigned int getsnap(ipmp_snap_t **); +extern boolean_t addrlist_add(addrlist_t **, const char *, uint64_t, + struct sockaddr_storage *); +extern void addrlist_free(addrlist_t **); + #ifdef __cplusplus } #endif diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c index 27716cabce..703ddcfaad 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/main.c @@ -17,14 +17,11 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "defs.h" #include "tables.h" #include <fcntl.h> @@ -122,7 +119,7 @@ sendpacket(struct sockaddr_in6 *sin6, int sock, int size, int flags) char abuf[INET6_ADDRSTRLEN]; cc = sendto(sock, (char *)packet, size, flags, - (struct sockaddr *)sin6, sizeof (*sin6)); + (struct sockaddr *)sin6, sizeof (*sin6)); if (cc < 0 || cc != size) { if (cc < 0) { logperror("sendpacket: sendto"); @@ -135,6 +132,32 @@ sendpacket(struct sockaddr_in6 *sin6, int sock, int size, int flags) } } +/* + * If possible, place an ND_OPT_SOURCE_LINKADDR option at `optp'. + * Return the number of bytes placed in the option. + */ +static uint_t +add_opt_lla(struct phyint *pi, struct nd_opt_lla *optp) +{ + uint_t optlen; + uint_t hwaddrlen; + struct lifreq lifr; + + /* If this phyint doesn't have a link-layer address, bail */ + if (phyint_get_lla(pi, &lifr) == -1) + return (0); + + hwaddrlen = lifr.lifr_nd.lnr_hdw_len; + /* roundup to multiple of 8 and make padding zero */ + optlen = ((sizeof (struct nd_opt_hdr) + hwaddrlen + 7) / 8) * 8; + bzero(optp, optlen); + optp->nd_opt_lla_type = ND_OPT_SOURCE_LINKADDR; + optp->nd_opt_lla_len = optlen / 8; + bcopy(lifr.lifr_nd.lnr_hdw_addr, optp->nd_opt_lla_hdw_addr, hwaddrlen); + + return (optlen); +} + /* Send a Router Solicitation */ static void solicit(struct sockaddr_in6 *sin6, struct phyint *pi) @@ -151,24 +174,8 @@ solicit(struct sockaddr_in6 *sin6, struct phyint *pi) packetlen += sizeof (*rs); pptr += sizeof (*rs); - /* Attach any options */ - if (pi->pi_hdw_addr_len != 0) { - struct nd_opt_lla *lo = (struct nd_opt_lla *)pptr; - int optlen; - - /* roundup to multiple of 8 and make padding zero */ - optlen = ((sizeof (struct nd_opt_hdr) + - pi->pi_hdw_addr_len + 7) / 8) * 8; - bzero(pptr, optlen); - - lo->nd_opt_lla_type = ND_OPT_SOURCE_LINKADDR; - lo->nd_opt_lla_len = optlen / 8; - bcopy((char *)pi->pi_hdw_addr, - (char *)lo->nd_opt_lla_hdw_addr, - pi->pi_hdw_addr_len); - packetlen += optlen; - pptr += 
optlen; - } + /* add options */ + packetlen += add_opt_lla(pi, (struct nd_opt_lla *)pptr); if (debug & D_PKTOUT) { print_route_sol("Sending solicitation to ", pi, rs, packetlen, @@ -224,24 +231,9 @@ advertise(struct sockaddr_in6 *sin6, struct phyint *pi, boolean_t no_prefixes) return; } - /* Attach any options */ - if (pi->pi_hdw_addr_len != 0) { - struct nd_opt_lla *lo = (struct nd_opt_lla *)pptr; - int optlen; - - /* roundup to multiple of 8 and make padding zero */ - optlen = ((sizeof (struct nd_opt_hdr) + - pi->pi_hdw_addr_len + 7) / 8) * 8; - bzero(pptr, optlen); - - lo->nd_opt_lla_type = ND_OPT_SOURCE_LINKADDR; - lo->nd_opt_lla_len = optlen / 8; - bcopy((char *)pi->pi_hdw_addr, - (char *)lo->nd_opt_lla_hdw_addr, - pi->pi_hdw_addr_len); - packetlen += optlen; - pptr += optlen; - } + /* add options */ + packetlen += add_opt_lla(pi, (struct nd_opt_lla *)pptr); + pptr = (char *)packet + packetlen; if (pi->pi_AdvLinkMTU != 0) { struct nd_opt_mtu *mo = (struct nd_opt_mtu *)pptr; @@ -1671,10 +1663,10 @@ process_rtsock(int rtsock) return; } - if (ifm->ifm_flags != pi->pi_flags) { + if (ifm->ifm_flags != (uint_t)pi->pi_flags) { if (debug & D_IFSCAN) { logmsg(LOG_DEBUG, "process_rtsock: clr for " - "%s old flags 0x%x new flags 0x%x\n", + "%s old flags 0x%llx new flags 0x%x\n", pi->pi_name, pi->pi_flags, ifm->ifm_flags); } } @@ -1825,141 +1817,67 @@ process_mibsock(int mibsock) } /* - * Check whether the address formed by pr->pr_prefix and pi_token - * exists in the kernel. Cannot call SIOCTMYADDR/ONLINK as it - * does not check for down addresses. This function should not - * be called for onlink prefixes. 
- */ -static boolean_t -is_address_present(struct phyint *pi, struct prefix *pr, uint64_t flags) -{ - int s; - in6_addr_t addr, *token; - int i; - int ret; - struct sockaddr_in6 sin6; - - s = socket(AF_INET6, SOCK_DGRAM, 0); - if (s < 0) { - logperror("is_address_present: socket"); - /* - * By returning B_TRUE, we make the caller delete - * the prefix from the internal table. In the worst - * case the next RA will create the prefix. - */ - return (_B_TRUE); - } - if (flags & IFF_TEMPORARY) - token = &pi->pi_tmp_token; - else - token = &pi->pi_token; - for (i = 0; i < 16; i++) { - /* - * prefix_create ensures that pr_prefix has all-zero - * bits after prefixlen. - */ - addr.s6_addr[i] = pr->pr_prefix.s6_addr[i] | token->s6_addr[i]; - } - (void) memset(&sin6, 0, sizeof (struct sockaddr_in6)); - sin6.sin6_family = AF_INET6; - sin6.sin6_addr = addr; - ret = bind(s, (struct sockaddr *)&sin6, sizeof (struct sockaddr_in6)); - (void) close(s); - if (ret < 0 && errno == EADDRNOTAVAIL) - return (_B_FALSE); - else - return (_B_TRUE); -} - -/* * Look if the phyint or one of its prefixes have been removed from * the kernel and take appropriate action. - * Uses {pi,pr}_in_use. + * Uses pr_in_use and pi{,_kernel}_state. */ static void check_if_removed(struct phyint *pi) { - struct prefix *pr; - struct prefix *next_pr; + struct prefix *pr, *next_pr; /* - * Detect phyints that have been removed from the kernel. - * Since we can't recreate it here (would require ifconfig plumb - * logic) we just terminate use of that phyint. - */ - if (!(pi->pi_kernel_state & PI_PRESENT) && - (pi->pi_state & PI_PRESENT)) { - logmsg(LOG_ERR, "Interface %s has been removed from kernel. " - "in.ndpd will no longer use it\n", pi->pi_name); - /* - * Clear state so that should the phyint reappear - * we will start with initial advertisements or - * solicitations. - */ - phyint_cleanup(pi); - } - /* * Detect prefixes which are removed. 
- * - * We remove the prefix in all of the following cases : - * - * 1) Static prefixes are not the ones we create. So, - * just remove it from our tables. - * - * 2) On-link prefixes potentially move to a different - * phyint during failover. As it does not have - * an address, we can't use the logic in is_address_present - * to detect whether it is present in the kernel or not. - * Thus when it is manually removed we don't recreate it. - * - * 3) If there is a token mis-match and this prefix is not - * in the kernel, it means we don't need this prefix on - * this interface anymore. It must have been moved to a - * different interface by in.mpathd. This normally - * happens after a failover followed by a failback (or - * another failover) and we re-read the network - * configuration. For the failover from A to B, we would - * have created state on B about A's address, which will - * not be in use after the subsequent failback. So, we - * remove that prefix here. - * - * 4) If the physical interface is not present, then remove - * the prefix. In the cases where we are advertising - * prefixes, the state is kept in advertisement prefix and - * hence we can delete the prefix. - * - * 5) Similar to case (3), when we failover from A to B, the - * prefix in A will not be in use as it has been moved to B. - * We will delete it from our tables and recreate it when - * it fails back. is_address_present makes sure that the - * address is still valid in kernel. - * - * If none of the above is true, we recreate the prefix as it - * has been manually removed. We do it only when the interface - * is not FAILED or INACTIVE or OFFLINE. + * Static prefixes are just removed from our tables. + * Non-static prefixes are recreated i.e. in.ndpd takes precedence + * over manually removing prefixes via ifconfig. 
*/ for (pr = pi->pi_prefix_list; pr != NULL; pr = next_pr) { next_pr = pr->pr_next; if (!pr->pr_in_use) { - /* Clear PR_AUTO and PR_ONLINK */ + /* Clear everything except PR_STATIC */ pr->pr_kernel_state &= PR_STATIC; - if ((pr->pr_state & PR_STATIC) || - !(pr->pr_state & PR_AUTO) || - !(prefix_token_match(pi, pr, pr->pr_flags)) || - (!(pi->pi_kernel_state & PI_PRESENT)) || - (is_address_present(pi, pr, pr->pr_flags))) { + pr->pr_name[0] = '\0'; + if (pr->pr_state & PR_STATIC) { prefix_delete(pr); - } else if (!(pi->pi_flags & - (IFF_FAILED|IFF_INACTIVE|IFF_OFFLINE)) && - pr->pr_state != pr->pr_kernel_state) { - pr->pr_name[0] = '\0'; + } else if (!(pi->pi_kernel_state & PI_PRESENT)) { + /* + * Ensure that there are no future attempts to + * run prefix_update_k since the phyint is gone. + */ + pr->pr_state = pr->pr_kernel_state; + } else if (pr->pr_state != pr->pr_kernel_state) { logmsg(LOG_INFO, "Prefix manually removed " - "on %s - recreating it!\n", - pi->pi_name); + "on %s; recreating\n", pi->pi_name); prefix_update_k(pr); } } } + + /* + * Detect phyints that have been removed from the kernel, and tear + * down any prefixes we created that are associated with that phyint. + * (NOTE: IPMP depends on in.ndpd tearing down these prefixes so an + * administrator can easily place an IP interface with ADDRCONF'd + * addresses into an IPMP group.) + */ + if (!(pi->pi_kernel_state & PI_PRESENT) && + (pi->pi_state & PI_PRESENT)) { + logmsg(LOG_ERR, "Interface %s has been removed from kernel. " + "in.ndpd will no longer use it\n", pi->pi_name); + + for (pr = pi->pi_prefix_list; pr != NULL; pr = next_pr) { + next_pr = pr->pr_next; + if (pr->pr_state & PR_AUTO) + prefix_delete(pr); + } + + /* + * Clear state so that should the phyint reappear we will + * start with initial advertisements or solicitations. 
+ */ + phyint_cleanup(pi); + } } diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c index 5d64a9303d..0a9e1e6a13 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/ndp.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -383,29 +383,12 @@ incoming_ra(struct phyint *pi, struct nd_router_advert *ra, int len, if (no_loopback && loopback) return; - /* - * If the interface is FAILED or INACTIVE or OFFLINE, don't - * create any addresses on them. in.mpathd assumes that no new - * addresses will appear on these. This implies that we - * won't create any new prefixes advertised by the router - * on FAILED/INACTIVE/OFFLINE interfaces. When the state changes, - * the next RA will create the prefix on this interface. - */ - if (pi->pi_flags & (IFF_FAILED|IFF_INACTIVE|IFF_OFFLINE)) - return; + bzero(&lifr, sizeof (lifr)); + (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; - if (ioctl(pi->pi_sock, SIOCGLIFLNKINFO, (char *)&lifr) < 0) { - if (errno == ENXIO) - return; - logperror_pi(pi, "incoming_ra: SIOCGLIFLNKINFO"); - return; - } if (ra->nd_ra_curhoplimit != CURHOP_UNSPECIFIED && ra->nd_ra_curhoplimit != pi->pi_CurHopLimit) { pi->pi_CurHopLimit = ra->nd_ra_curhoplimit; - lifr.lifr_ifinfo.lir_maxhops = pi->pi_CurHopLimit; set_needed = _B_TRUE; } @@ -460,7 +443,7 @@ incoming_ra(struct phyint *pi, struct nd_router_advert *ra, int len, logmsg(LOG_DEBUG, "incoming_ra: trigger dhcp %s on %s\n", (ra->nd_ra_flags_reserved & ~pi->pi_ra_flags & - ND_RA_FLAG_MANAGED) ? "MANAGED" : "OTHER", + ND_RA_FLAG_MANAGED) ? 
"MANAGED" : "OTHER", pi->pi_name); } pi->pi_ra_flags |= ra->nd_ra_flags_reserved; @@ -999,11 +982,9 @@ incoming_prefix_addrconf_process(struct phyint *pi, struct prefix *pr, * Delete this prefix structure as kernel * does not allow duplicated addresses */ - logmsg(LOG_ERR, "incoming_prefix_addrconf_process: " - "Duplicate prefix %s received on interface %s\n", - inet_ntop(AF_INET6, - (void *)&po->nd_opt_pi_prefix, abuf, + "Duplicate prefix %s received on interface %s\n", + inet_ntop(AF_INET6, &po->nd_opt_pi_prefix, abuf, sizeof (abuf)), pi->pi_name); logmsg(LOG_ERR, "incoming_prefix_addrconf_process: " "Prefix already exists in interface %s\n", @@ -1129,12 +1110,8 @@ incoming_mtu_opt(struct phyint *pi, uchar_t *opt, } pi->pi_LinkMTU = mtu; - (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; - if (ioctl(pi->pi_sock, SIOCGLIFLNKINFO, (char *)&lifr) < 0) { - logperror_pi(pi, "incoming_mtu_opt: SIOCGLIFLNKINFO"); - return; - } + bzero(&lifr, sizeof (lifr)); + (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); lifr.lifr_ifinfo.lir_maxmtu = pi->pi_LinkMTU; if (ioctl(pi->pi_sock, SIOCSLIFLNKINFO, (char *)&lifr) < 0) { logperror_pi(pi, "incoming_mtu_opt: SIOCSLIFLNKINFO"); @@ -1155,33 +1132,33 @@ incoming_lla_opt(struct phyint *pi, uchar_t *opt, struct sockaddr_in6 *sin6; int max_content_len; - if (pi->pi_hdw_addr_len == 0) + /* + * Get our link-layer address length. We may not have one, in which + * case we can just bail. + */ + if (phyint_get_lla(pi, &lifr) != 0) return; /* * Can't remove padding since it is link type specific. - * However, we check against the length of our link-layer - * address. - * Note: assumes that all links have a fixed lengh address. + * However, we check against the length of our link-layer address. + * Note: assumes that all links have a fixed length address. 
*/ max_content_len = lo->nd_opt_lla_len * 8 - sizeof (struct nd_opt_hdr); - if (max_content_len < pi->pi_hdw_addr_len || + if (max_content_len < lifr.lifr_nd.lnr_hdw_len || (max_content_len >= 8 && - max_content_len - 7 > pi->pi_hdw_addr_len)) { + max_content_len - 7 > lifr.lifr_nd.lnr_hdw_len)) { char abuf[INET6_ADDRSTRLEN]; (void) inet_ntop(AF_INET6, (void *)&from->sin6_addr, abuf, sizeof (abuf)); logmsg(LOG_INFO, "lla option from %s on %s too long with bad " - "physaddr length (%d vs. %d bytes)\n", - abuf, pi->pi_name, - max_content_len, pi->pi_hdw_addr_len); + "physaddr length (%d vs. %d bytes)\n", abuf, pi->pi_name, + max_content_len, lifr.lifr_nd.lnr_hdw_len); return; } - lifr.lifr_nd.lnr_hdw_len = pi->pi_hdw_addr_len; - bcopy((char *)lo->nd_opt_lla_hdw_addr, - (char *)lifr.lifr_nd.lnr_hdw_addr, + bcopy(lo->nd_opt_lla_hdw_addr, lifr.lifr_nd.lnr_hdw_addr, lifr.lifr_nd.lnr_hdw_len); sin6 = (struct sockaddr_in6 *)&lifr.lifr_nd.lnr_addr; @@ -1196,8 +1173,7 @@ incoming_lla_opt(struct phyint *pi, uchar_t *opt, lifr.lifr_nd.lnr_state_same_lla = ND_UNCHANGED; lifr.lifr_nd.lnr_state_diff_lla = ND_STALE; lifr.lifr_nd.lnr_flags = isrouter; - (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); - lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; + (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); if (ioctl(pi->pi_sock, SIOCLIFSETND, (char *)&lifr) < 0) { logperror_pi(pi, "incoming_lla_opt: SIOCLIFSETND"); return; diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c index c8fc6381b7..09e6137965 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "defs.h" #include "tables.h" @@ -171,6 +169,7 @@ phyint_init_from_k(struct phyint *pi) struct ipv6_mreq v6mcastr; struct lifreq lifr; int fd; + int save_errno; boolean_t newsock; uint_t ttl; struct sockaddr_in6 *sin6; @@ -297,30 +296,6 @@ start_over: pi->pi_dst_token = in6addr_any; } - /* Get link-layer address */ - if (!(pi->pi_flags & IFF_MULTICAST) || - (pi->pi_flags & IFF_POINTOPOINT)) { - pi->pi_hdw_addr_len = 0; - } else { - sin6 = (struct sockaddr_in6 *)&lifr.lifr_nd.lnr_addr; - bzero(sin6, sizeof (struct sockaddr_in6)); - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = pi->pi_ifaddr; - - if (ioctl(fd, SIOCLIFGETND, (char *)&lifr) < 0) { - logperror_pi(pi, "phyint_init_from_k: SIOCLIFGETND"); - goto error; - } - - pi->pi_hdw_addr_len = lifr.lifr_nd.lnr_hdw_len; - - if (lifr.lifr_nd.lnr_hdw_len != 0) { - bcopy((char *)lifr.lifr_nd.lnr_hdw_addr, - (char *)pi->pi_hdw_addr, - lifr.lifr_nd.lnr_hdw_len); - } - } - if (newsock) { icmp6_filter_t filter; int on = 1; @@ -360,8 +335,21 @@ start_over: v6mcastr.ipv6mr_interface = pi->pi_index; if (setsockopt(fd, IPPROTO_IPV6, IPV6_JOIN_GROUP, (char *)&v6mcastr, sizeof (v6mcastr)) < 0) { - logperror_pi(pi, "phyint_init_from_k: " - "setsockopt IPV6_JOIN_GROUP"); + /* + * One benign reason IPV6_JOIN_GROUP could fail is + * when `pi' has been placed into an IPMP group and we + * haven't yet processed the routing socket message + * informing us of its disappearance. As such, if + * it's now in a group, don't print an error. 
+ */ + save_errno = errno; + (void) strlcpy(lifr.lifr_name, pi->pi_name, LIFNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) == -1 || + lifr.lifr_groupname[0] == '\0') { + errno = save_errno; + logperror_pi(pi, "phyint_init_from_k: " + "setsockopt IPV6_JOIN_GROUP"); + } goto error; } pi->pi_state |= PI_JOINED_ALLNODES; @@ -403,8 +391,17 @@ start_over: v6mcastr.ipv6mr_interface = pi->pi_index; if (setsockopt(fd, IPPROTO_IPV6, IPV6_JOIN_GROUP, (char *)&v6mcastr, sizeof (v6mcastr)) < 0) { - logperror_pi(pi, "phyint_init_from_k: setsockopt " - "IPV6_JOIN_GROUP"); + /* + * See IPV6_JOIN_GROUP comment above. + */ + save_errno = errno; + (void) strlcpy(lifr.lifr_name, pi->pi_name, LIFNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) == -1 || + lifr.lifr_groupname[0] == '\0') { + errno = save_errno; + logperror_pi(pi, "phyint_init_from_k: " + "setsockopt IPV6_JOIN_GROUP"); + } goto error; } pi->pi_state |= PI_JOINED_ALLROUTERS; @@ -569,22 +566,16 @@ phyint_print(struct phyint *pi) struct adv_prefix *adv_pr; struct router *dr; char abuf[INET6_ADDRSTRLEN]; - char llabuf[BUFSIZ]; logmsg(LOG_DEBUG, "Phyint %s index %d state %x, kernel %x, " "num routers %d\n", pi->pi_name, pi->pi_index, pi->pi_state, pi->pi_kernel_state, pi->pi_num_k_routers); - logmsg(LOG_DEBUG, "\taddress: %s flags %x\n", + logmsg(LOG_DEBUG, "\taddress: %s flags %llx\n", inet_ntop(AF_INET6, (void *)&pi->pi_ifaddr, abuf, sizeof (abuf)), pi->pi_flags); - logmsg(LOG_DEBUG, "\tsock %d mtu %d hdw_addr len %d <%s>\n", - pi->pi_sock, pi->pi_mtu, pi->pi_hdw_addr_len, - ((pi->pi_hdw_addr_len != 0) ? 
- fmt_lla(llabuf, sizeof (llabuf), pi->pi_hdw_addr, - pi->pi_hdw_addr_len) : "none")); - logmsg(LOG_DEBUG, "\ttoken: len %d %s\n", - pi->pi_token_length, + logmsg(LOG_DEBUG, "\tsock %d mtu %d\n", pi->pi_sock, pi->pi_mtu); + logmsg(LOG_DEBUG, "\ttoken: len %d %s\n", pi->pi_token_length, inet_ntop(AF_INET6, (void *)&pi->pi_token, abuf, sizeof (abuf))); if (pi->pi_TmpAddrsEnabled) { @@ -632,6 +623,43 @@ phyint_print(struct phyint *pi) logmsg(LOG_DEBUG, "\n"); } + +/* + * Store the LLA for the phyint `pi' `lifrp'. Returns 0 on success, or + * -1 on failure. + * + * Note that we do not cache the hardware address since there's no reliable + * mechanism to determine when it's become stale. + */ +int +phyint_get_lla(struct phyint *pi, struct lifreq *lifrp) +{ + struct sockaddr_in6 *sin6; + + /* If this phyint doesn't have a link-layer address, bail */ + if (!(pi->pi_flags & IFF_MULTICAST) || + (pi->pi_flags & IFF_POINTOPOINT)) { + return (-1); + } + + (void) strlcpy(lifrp->lifr_name, pi->pi_name, LIFNAMSIZ); + sin6 = (struct sockaddr_in6 *)&(lifrp->lifr_nd.lnr_addr); + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = pi->pi_ifaddr; + if (ioctl(pi->pi_sock, SIOCLIFGETND, lifrp) < 0) { + /* + * For IPMP interfaces, don't report ESRCH errors since that + * merely indicates that there are no active interfaces in the + * IPMP group (and thus there's no working hardware address), + * and the packet will thus never make it out anyway. + */ + if (!(pi->pi_flags & IFF_IPMP) || errno != ESRCH) + logperror_pi(pi, "phyint_get_lla: SIOCLIFGETND"); + return (-1); + } + return (0); +} + /* * Randomize pi->pi_ReachableTime. 
* Done periodically when there are no RAs and at a maximum frequency when @@ -642,20 +670,14 @@ phyint_print(struct phyint *pi) void phyint_reach_random(struct phyint *pi, boolean_t set_needed) { + struct lifreq lifr; + pi->pi_ReachableTime = GET_RANDOM( (int)(ND_MIN_RANDOM_FACTOR * pi->pi_BaseReachableTime), (int)(ND_MAX_RANDOM_FACTOR * pi->pi_BaseReachableTime)); if (set_needed) { - struct lifreq lifr; - - (void) strncpy(lifr.lifr_name, pi->pi_name, - sizeof (lifr.lifr_name)); - pi->pi_name[sizeof (pi->pi_name) - 1] = '\0'; - if (ioctl(pi->pi_sock, SIOCGLIFLNKINFO, (char *)&lifr) < 0) { - logperror_pi(pi, - "phyint_reach_random: SIOCGLIFLNKINFO"); - return; - } + bzero(&lifr, sizeof (lifr)); + (void) strlcpy(lifr.lifr_name, pi->pi_name, LIFNAMSIZ); lifr.lifr_ifinfo.lir_reachtime = pi->pi_ReachableTime; if (ioctl(pi->pi_sock, SIOCSLIFLNKINFO, (char *)&lifr) < 0) { logperror_pi(pi, @@ -1386,12 +1408,12 @@ prefix_modify_flags(struct prefix *pr, uint64_t onflags, uint64_t offflags) (void) strncpy(lifr.lifr_name, pr->pr_name, sizeof (lifr.lifr_name)); lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; if (ioctl(pi->pi_sock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { - logperror_pr(pr, "prefix_modify_flags: SIOCGLIFFLAGS"); - logmsg(LOG_ERR, "prefix_modify_flags(%s, %s) old 0x%llx " - "on 0x%llx off 0x%llx\n", - pr->pr_physical->pi_name, - pr->pr_name, - pr->pr_flags, onflags, offflags); + if (errno != ENXIO) { + logperror_pr(pr, "prefix_modify_flags: SIOCGLIFFLAGS"); + logmsg(LOG_ERR, "prefix_modify_flags(%s, %s) old 0x%llx" + " on 0x%llx off 0x%llx\n", pr->pr_physical->pi_name, + pr->pr_name, pr->pr_flags, onflags, offflags); + } return (-1); } old_flags = lifr.lifr_flags; @@ -1399,12 +1421,13 @@ prefix_modify_flags(struct prefix *pr, uint64_t onflags, uint64_t offflags) lifr.lifr_flags &= ~offflags; pr->pr_flags = lifr.lifr_flags; if (ioctl(pi->pi_sock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { - logperror_pr(pr, "prefix_modify_flags: SIOCSLIFFLAGS"); - logmsg(LOG_ERR, 
"prefix_modify_flags(%s, %s) old 0x%llx " - "new 0x%llx on 0x%llx off 0x%llx\n", - pr->pr_physical->pi_name, - pr->pr_name, - old_flags, lifr.lifr_flags, onflags, offflags); + if (errno != ENXIO) { + logperror_pr(pr, "prefix_modify_flags: SIOCSLIFFLAGS"); + logmsg(LOG_ERR, "prefix_modify_flags(%s, %s) old 0x%llx" + " new 0x%llx on 0x%llx off 0x%llx\n", + pr->pr_physical->pi_name, pr->pr_name, + old_flags, lifr.lifr_flags, onflags, offflags); + } return (-1); } return (0); @@ -1540,7 +1563,8 @@ prefix_update_k(struct prefix *pr) /* Remove logical interface based on pr_name */ lifr.lifr_addr.ss_family = AF_UNSPEC; - if (ioctl(pi->pi_sock, SIOCLIFREMOVEIF, (char *)&lifr) < 0) { + if (ioctl(pi->pi_sock, SIOCLIFREMOVEIF, (char *)&lifr) < 0 && + errno != ENXIO) { logperror_pr(pr, "prefix_update_k: SIOCLIFREMOVEIF"); } pr->pr_kernel_state = 0; @@ -1865,36 +1889,6 @@ prefix_print(struct prefix *pr) } /* - * Does the address formed by pr->pr_prefix and pi->pi_token match - * pr->pr_address. It does not match if a failover has happened - * earlier (done by in.mpathd) from a different pi. Should not - * be called for onlink prefixes. - */ -boolean_t -prefix_token_match(struct phyint *pi, struct prefix *pr, uint64_t flags) -{ - int i; - in6_addr_t addr, *token; - - if (flags & IFF_TEMPORARY) - token = &pi->pi_tmp_token; - else - token = &pi->pi_token; - for (i = 0; i < 16; i++) { - /* - * prefix_create ensures that pr_prefix has all-zero - * bits after prefixlen. - */ - addr.s6_addr[i] = pr->pr_prefix.s6_addr[i] | token->s6_addr[i]; - } - if (IN6_ARE_ADDR_EQUAL(&pr->pr_address, &addr)) { - return (_B_TRUE); - } else { - return (_B_FALSE); - } -} - -/* * Lookup advertisement prefix structure that matches the prefix and * prefix length. * Assumes that the bits after prefixlen might not be zero. 
@@ -2305,8 +2299,7 @@ phyint_print_all(void) } void -phyint_cleanup(pi) - struct phyint *pi; +phyint_cleanup(struct phyint *pi) { pi->pi_state = 0; pi->pi_kernel_state = 0; diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h index 409600a402..dfc5414d5d 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h +++ b/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/tables.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _NDPD_TABLES_H #define _NDPD_TABLES_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -58,9 +56,7 @@ struct phyint { char pi_name[LIFNAMSIZ]; /* Used to identify it */ int pi_sock; /* For sending and receiving */ struct in6_addr pi_ifaddr; /* Local address */ - uint_t pi_flags; /* IFF_* flags */ - uint_t pi_hdw_addr_len; - uchar_t pi_hdw_addr[ND_MAX_HDW_LEN]; + uint64_t pi_flags; /* IFF_* flags */ uint_t pi_mtu; /* From SIOCGLIFMTU */ struct in6_addr pi_token; uint_t pi_token_length; @@ -256,6 +252,7 @@ extern int phyint_init_from_k(struct phyint *pi); extern void phyint_delete(struct phyint *pi); extern uint_t phyint_timer(struct phyint *pi, uint_t elapsed); extern void phyint_print_all(void); +extern int phyint_get_lla(struct phyint *pi, struct lifreq *lifrp); extern void phyint_reach_random(struct phyint *pi, boolean_t set_needed); extern void phyint_cleanup(struct phyint *pi); @@ -280,8 +277,6 @@ extern void prefix_update_k(struct prefix *pr); extern uint_t prefix_timer(struct prefix *pr, uint_t elapsed); extern uint_t adv_prefix_timer(struct adv_prefix *adv_pr, uint_t elapsed); -extern boolean_t prefix_token_match(struct phyint *pi, - struct prefix *pr, uint64_t flags); extern struct prefix *prefix_lookup_addr(struct phyint *pi, struct in6_addr prefix); diff --git 
a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c index 15db1b7539..b76341e303 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c +++ b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c @@ -1,3 +1,7 @@ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ /* -*- Mode: C; tab-width: 4 -*- * * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved. @@ -130,8 +134,6 @@ First checkin */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mDNSUNP.h" #include "mDNSDebug.h" @@ -398,13 +400,11 @@ select_src_ifi_info_solaris(int sockfd, int numifs, continue; /* * Avoid address if any of the following flags are set: - * IFF_NOFAILOVER: IPMP test address for use only by in.mpathd * IFF_NOXMIT: no packets transmitted over interface * IFF_NOLOCAL: no address * IFF_PRIVATE: is not advertised */ - if (ifflags & (IFF_NOFAILOVER | IFF_NOXMIT - | IFF_NOLOCAL | IFF_PRIVATE)) + if (ifflags & (IFF_NOXMIT | IFF_NOLOCAL | IFF_PRIVATE)) continue; if (*best_lifr != NULL) { diff --git a/usr/src/cmd/cmd-inet/usr.sbin/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/Makefile index d91d113347..e29c1765ec 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/Makefile +++ b/usr/src/cmd/cmd-inet/usr.sbin/Makefile @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# @@ -65,12 +65,13 @@ K5TELNETOBJS= in.telnetd.o SRCS= $(PROGSRCS) $(OTHERSRC) SUBDIRS= bootconfchk htable ifconfig in.ftpd in.rdisc in.routed \ - in.talkd inetadm inetconv ipqosconf kssl/kssladm kssl/ksslcfg \ - ping routeadm snoop sppptun traceroute wificonfig ipsecutils + in.talkd inetadm inetconv ipmpstat ipqosconf ipsecutils \ + kssl/kssladm kssl/ksslcfg ping routeadm snoop sppptun \ + traceroute wificonfig MSGSUBDIRS= bootconfchk htable ifconfig in.ftpd in.routed in.talkd inetadm \ - inetconv ipqosconf kssl/ksslcfg routeadm sppptun snoop \ - wificonfig ipsecutils + inetconv ipmpstat ipqosconf ipsecutils kssl/ksslcfg routeadm \ + sppptun snoop wificonfig # As programs get lint-clean, add them here and to the 'lint' target. # Eventually this hack should go away, and all in PROG should be @@ -83,7 +84,8 @@ LINTCLEAN= 6to4relay arp in.rlogind in.rshd in.telnetd in.tftpd \ # with SUBDIRS. Also (sigh) deal with the commented-out build lines # for the lint rule. LINTSUBDIRS= bootconfchk in.rdisc in.routed in.talkd inetadm inetconv \ - ipqosconf ping routeadm sppptun traceroute wificonfig ipsecutils + ipmpstat ipqosconf ipsecutils ping routeadm sppptun traceroute \ + wificonfig # And as programs are verified not to attempt to write into constants, # -xstrconst should be used to ensure they stay that way. 
CONSTCLEAN= @@ -144,6 +146,8 @@ LDLIBS += $(K5LIBS) $(TSNETPROG) := LDLIBS += -ltsnet in.rarpd := LDLIBS += -linetutil -ldlpi +if_mpadm := LDLIBS += -linetutil -lipmp +if_mpadm.po := XGETFLAGS += -a route := CPPFLAGS += -DNDEBUG ndd := LDLIBS += -ldladm gettable in.comsat := LDFLAGS += $(MAPFILE.NGB:%=-M%) @@ -245,7 +249,7 @@ lint: $(LINTSUBDIRS) -I$(SRC)/lib/gss_mechs/mech_krb5/include \ -I$(SRC)/lib/pam_modules/krb5 \ in.telnetd.c $(LDLIBS) -lbsm -lpam -lsocket -lnsl - $(LINT.c) if_mpadm.c $(LDLIBS) -lsocket -lnsl -lipmp + $(LINT.c) if_mpadm.c $(LDLIBS) -lsocket -lnsl -lipmp -linetutil $(LINT.c) ipaddrsel.c $(LDLIBS) -lsocket -lnsl $(LINT.c) route.c $(LDLIBS) -lsocket -lnsl -ltsnet $(LINT.c) syncinit.c $(LDLIBS) -ldlpi diff --git a/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c b/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c index d4874135fd..7c5d73c796 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/if_mpadm.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,660 +19,250 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <unistd.h> -#include <stdlib.h> +#include <errno.h> +#include <ipmp_admin.h> +#include <libinetutil.h> +#include <locale.h> +#include <net/if.h> +#include <stdarg.h> #include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> #include <sys/socket.h> -#include <netinet/in.h> -#include <netinet/tcp.h> #include <sys/sockio.h> -#include <net/if.h> -#include <errno.h> -#include <strings.h> -#include <ipmp_mpathd.h> -#include <libintl.h> +#include <sys/types.h> -static int if_down(int ifsock, struct lifreq *lifr); -static int if_up(int ifsock, struct lifreq *lifr); -static void send_cmd(int cmd, char *ifname); -static int connect_to_mpathd(sa_family_t family); -static void do_offline(char *ifname); -static void undo_offline(char *ifname); -static boolean_t offline_set(char *ifname); +typedef void offline_func_t(const char *, ipmp_handle_t); -#define IF_SEPARATOR ':' -#define MAX_RETRIES 3 +static const char *progname; +static int sioc4fd, sioc6fd; +static offline_func_t do_offline, undo_offline; +static boolean_t set_lifflags(const char *, uint64_t); +static boolean_t is_offline(const char *); +static void warn(const char *, ...); +static void die(const char *, ...); static void usage() { - (void) fprintf(stderr, "Usage : if_mpadm [-d | -r] <interface_name>\n"); + (void) fprintf(stderr, "Usage: %s [-d | -r] <interface>\n", progname); + exit(1); } -static void -print_mpathd_error_msg(uint32_t error) +static const char * +mpadm_errmsg(uint32_t error) { switch (error) { - case MPATHD_MIN_RED_ERROR: - (void) fprintf(stderr, gettext( - "Offline failed as there is no other functional " - "interface available in the multipathing group " - "for failing over the network access.\n")); - break; - - case MPATHD_FAILBACK_PARTIAL: - (void) fprintf(stderr, gettext( - "Offline cannot be undone because multipathing " - "configuration is not consistent across all the " - "interfaces in the 
group.\n")); - break; - + case IPMP_EUNKIF: + return ("not a physical interface or not in an IPMP group"); + case IPMP_EMINRED: + return ("no other functioning interfaces are in its IPMP " + "group"); default: - /* - * We shouldn't get here. All errors should have a - * meaningful error message, as shown in the above - * cases. If we get here, someone has made a mistake. - */ - (void) fprintf(stderr, gettext( - "Operation returned an unrecognized error: %u\n"), - error); - break; + return (ipmp_errmsg(error)); } } int main(int argc, char **argv) { - char *ifname; - int cmd = 0; + int retval; + ipmp_handle_t handle; + offline_func_t *ofuncp = NULL; + const char *ifname; int c; -#if !defined(TEXT_DOMAIN) -#define TEXT_DOMAIN "SYS_TEST" -#endif + if ((progname = strrchr(argv[0], '/')) != NULL) + progname++; + else + progname = argv[0]; + + (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); while ((c = getopt(argc, argv, "d:r:")) != EOF) { switch (c) { case 'd': ifname = optarg; - cmd = MI_OFFLINE; - if (offline_set(ifname)) { - (void) fprintf(stderr, gettext("Interface " - "already offlined\n")); - exit(1); - } + ofuncp = do_offline; break; case 'r': ifname = optarg; - cmd = MI_UNDO_OFFLINE; - if (!offline_set(ifname)) { - (void) fprintf(stderr, gettext("Interface not " - "offlined\n")); - exit(1); - } + ofuncp = undo_offline; break; default : usage(); - exit(1); } } - if (cmd == 0) { + if (ofuncp == NULL) usage(); - exit(1); - } /* - * Send the command to in.mpathd which is generic to - * both the commands. send_cmd returns only if there - * is no error. + * Create the global V4 and V6 socket ioctl descriptors. 
*/ - send_cmd(cmd, ifname); - if (cmd == MI_OFFLINE) { - do_offline(ifname); - } else { - undo_offline(ifname); - } + sioc4fd = socket(AF_INET, SOCK_DGRAM, 0); + sioc6fd = socket(AF_INET6, SOCK_DGRAM, 0); + if (sioc4fd == -1 || sioc6fd == -1) + die("cannot create sockets"); - return (0); -} + if ((retval = ipmp_open(&handle)) != IPMP_SUCCESS) + die("cannot create ipmp handle: %s\n", ipmp_errmsg(retval)); -/* - * Is IFF_OFFLINE set ? - * Returns B_FALSE on failure and B_TRUE on success. - */ -boolean_t -offline_set(char *ifname) -{ - struct lifreq lifr; - int s4; - int s6; - int ret; - - s4 = socket(AF_INET, SOCK_DGRAM, 0); - if (s4 < 0) { - perror("socket"); - exit(1); - } - s6 = socket(AF_INET6, SOCK_DGRAM, 0); - if (s6 < 0) { - perror("socket"); - exit(1); - } - (void) strncpy(lifr.lifr_name, ifname, sizeof (lifr.lifr_name)); - ret = ioctl(s4, SIOCGLIFFLAGS, (caddr_t)&lifr); - if (ret < 0) { - if (errno != ENXIO) { - perror("ioctl: SIOCGLIFFLAGS"); - exit(1); - } - ret = ioctl(s6, SIOCGLIFFLAGS, (caddr_t)&lifr); - if (ret < 0) { - perror("ioctl: SIOCGLIFFLAGS"); - exit(1); - } - } - (void) close(s4); - (void) close(s6); - if (lifr.lifr_flags & IFF_OFFLINE) - return (B_TRUE); - else - return (B_FALSE); + (*ofuncp)(ifname, handle); + + ipmp_close(handle); + (void) close(sioc4fd); + (void) close(sioc6fd); + + return (EXIT_SUCCESS); } /* - * Sends the command to in.mpathd. If not successful, prints - * an error message and exits. + * Checks whether IFF_OFFLINE is set on `ifname'. 
*/ -void -send_cmd(int cmd, char *ifname) +boolean_t +is_offline(const char *ifname) { - struct mi_offline mio; - struct mi_undo_offline miu; - struct mi_result me; - int ret; - int cmd_len; - int i; - int s; - - for (i = 0; i < MAX_RETRIES; i++) { - s = connect_to_mpathd(AF_INET); - if (s == -1) { - s = connect_to_mpathd(AF_INET6); - if (s == -1) { - (void) fprintf(stderr, gettext("Cannot " - "establish communication with " - "in.mpathd.\n")); - exit(1); - } - } - switch (cmd) { - case MI_OFFLINE : - cmd_len = sizeof (struct mi_offline); - bzero(&mio, cmd_len); - mio.mio_command = cmd; - (void) strncpy(mio.mio_ifname, ifname, LIFNAMSIZ); - mio.mio_min_redundancy = 1; - ret = write(s, &mio, cmd_len); - if (ret != cmd_len) { - /* errno is set only when ret is -1 */ - if (ret == -1) - perror("write"); - (void) fprintf(stderr, gettext("Failed to " - "successfully send command to " - "in.mpathd.\n")); - exit(1); - } - break; - case MI_UNDO_OFFLINE: - cmd_len = sizeof (struct mi_undo_offline); - bzero(&miu, cmd_len); - miu.miu_command = cmd; - (void) strncpy(miu.miu_ifname, ifname, LIFNAMSIZ); - ret = write(s, &miu, cmd_len); - if (ret != cmd_len) { - /* errno is set only when ret is -1 */ - if (ret == -1) - perror("write"); - (void) fprintf(stderr, gettext("Failed to " - "successfully send command to " - "in.mpathd.\n")); - exit(1); - } - break; - default : - (void) fprintf(stderr, "Unknown command \n"); - exit(1); - } + struct lifreq lifr = { 0 }; - /* Read the result from mpathd */ - ret = read(s, &me, sizeof (me)); - if (ret != sizeof (me)) { - /* errno is set only when ret is -1 */ - if (ret == -1) - perror("read"); - (void) fprintf(stderr, gettext("Failed to successfully " - "read result from in.mpathd.\n")); - exit(1); + (void) strlcpy(lifr.lifr_name, ifname, sizeof (lifr.lifr_name)); + if (ioctl(sioc4fd, SIOCGLIFFLAGS, &lifr) == -1) { + if (errno != ENXIO || + ioctl(sioc6fd, SIOCGLIFFLAGS, &lifr) == -1) { + die("cannot get interface flags on %s", ifname); } - if 
(me.me_mpathd_error == 0) { - if (i != 0) { - /* - * We retried at least once. Tell the user - * that things succeeded now. - */ - (void) fprintf(stderr, - gettext("Retry Successful.\n")); - } - return; /* Successful */ - } - - if (me.me_mpathd_error == MPATHD_SYS_ERROR) { - if (me.me_sys_error == EAGAIN) { - (void) close(s); - (void) sleep(1); - (void) fprintf(stderr, - gettext("Retrying ...\n")); - continue; /* Retry */ - } - errno = me.me_sys_error; - perror("if_mpadm"); - } else { - print_mpathd_error_msg(me.me_mpathd_error); - } - exit(1); } - /* - * We come here only if we retry the operation multiple - * times and did not succeed. Let the user try it again - * later. - */ - (void) fprintf(stderr, - gettext("Device busy. Retry the operation later.\n")); - exit(1); + + return ((lifr.lifr_flags & IFF_OFFLINE) != 0); } static void -do_offline(char *ifname) +do_offline(const char *ifname, ipmp_handle_t handle) { - struct lifreq lifr; - struct lifreq *lifcr; - struct lifnum lifn; - struct lifconf lifc; - char *buf; - int numifs; - int n; - char pi_name[LIFNAMSIZ + 1]; - char *cp; - int ifsock_v4; - int ifsock_v6; - int af; - int ret; + ifaddrlistx_t *ifaddrp, *ifaddrs; + int retval; + + if (is_offline(ifname)) + die("interface %s is already offline\n", ifname); + + if ((retval = ipmp_offline(handle, ifname, 1)) != IPMP_SUCCESS) + die("cannot offline %s: %s\n", ifname, mpadm_errmsg(retval)); /* - * Verify whether IFF_OFFLINE is not set as a sanity check. - */ - if (!offline_set(ifname)) { - (void) fprintf(stderr, gettext("Operation failed : in.mpathd " - "has not set IFF_OFFLINE on %s\n"), ifname); - exit(1); - } - /* - * Get both the sockets as we may need to bring both - * IPv4 and IPv6 interfaces down. 
- */ - ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); - if (ifsock_v4 < 0) { - perror("socket"); - exit(1); - } - ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); - if (ifsock_v6 < 0) { - perror("socket"); - exit(1); - } - /* - * Get all the logicals for "ifname" and mark them down. - * There is no easy way of doing this. We get all the - * interfaces in the system using SICGLIFCONF and mark the - * ones matching the name down. + * Get all the up addresses for `ifname' and bring them down. */ - lifn.lifn_family = AF_UNSPEC; - lifn.lifn_flags = 0; - if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { - perror("ioctl : SIOCGLIFNUM"); - exit(1); - } - numifs = lifn.lifn_count; - - buf = calloc(numifs, sizeof (struct lifreq)); - if (buf == NULL) { - perror("calloc"); - exit(1); - } + if (ifaddrlistx(ifname, IFF_UP, 0, &ifaddrs) == -1) + die("cannot get addresses on %s", ifname); - lifc.lifc_family = AF_UNSPEC; - lifc.lifc_flags = 0; - lifc.lifc_len = numifs * sizeof (struct lifreq); - lifc.lifc_buf = buf; + for (ifaddrp = ifaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!(ifaddrp->ia_flags & IFF_OFFLINE)) + warn("IFF_OFFLINE vanished on %s\n", ifaddrp->ia_name); - if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { - perror("ioctl : SIOCGLIFCONF"); - exit(1); + if (!set_lifflags(ifaddrp->ia_name, + ifaddrp->ia_flags & ~IFF_UP)) + warn("cannot bring down address on %s\n", + ifaddrp->ia_name); } - lifcr = (struct lifreq *)lifc.lifc_req; - for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifcr++) { - af = lifcr->lifr_addr.ss_family; - (void) strncpy(pi_name, lifcr->lifr_name, - sizeof (pi_name)); - pi_name[sizeof (pi_name) - 1] = '\0'; - if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) - *cp = '\0'; - if (strcmp(pi_name, ifname) == 0) { - /* It matches the interface name that was offlined */ - (void) strncpy(lifr.lifr_name, lifcr->lifr_name, - sizeof (lifr.lifr_name)); - if (af == AF_INET) - ret = if_down(ifsock_v4, &lifr); - else - ret = 
if_down(ifsock_v6, &lifr); - if (ret != 0) { - (void) fprintf(stderr, gettext("Bringing down " - "the interfaces failed.\n")); - exit(1); - } - } - } + ifaddrlistx_free(ifaddrs); } static void -undo_offline(char *ifname) +undo_offline(const char *ifname, ipmp_handle_t handle) { - struct lifreq lifr; - struct lifreq *lifcr; - struct lifnum lifn; - struct lifconf lifc; - char *buf; - int numifs; - int n; - char pi_name[LIFNAMSIZ + 1]; - char *cp; - int ifsock_v4; - int ifsock_v6; - int af; - int ret; + ifaddrlistx_t *ifaddrp, *ifaddrs; + int retval; + + if (!is_offline(ifname)) + die("interface %s is not offline\n", ifname); /* - * Verify whether IFF_OFFLINE is set as a sanity check. - */ - if (offline_set(ifname)) { - (void) fprintf(stderr, gettext("Operation failed : in.mpathd " - "has not cleared IFF_OFFLINE on %s\n"), ifname); - exit(1); - } - /* - * Get both the sockets as we may need to bring both - * IPv4 and IPv6 interfaces UP. - */ - ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); - if (ifsock_v4 < 0) { - perror("socket"); - exit(1); - } - ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); - if (ifsock_v6 < 0) { - perror("socket"); - exit(1); - } - /* - * Get all the logicals for "ifname" and mark them up. - * There is no easy way of doing this. We get all the - * interfaces in the system using SICGLIFCONF and mark the - * ones matching the name up. + * Get all the down addresses for `ifname' and bring them up. 
*/ - lifn.lifn_family = AF_UNSPEC; - lifn.lifn_flags = 0; - if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { - perror("ioctl : SIOCGLIFNUM"); - exit(1); - } - numifs = lifn.lifn_count; - - buf = calloc(numifs, sizeof (struct lifreq)); - if (buf == NULL) { - perror("calloc"); - exit(1); - } + if (ifaddrlistx(ifname, 0, IFF_UP, &ifaddrs) == -1) + die("cannot get addresses for %s", ifname); - lifc.lifc_family = AF_UNSPEC; - lifc.lifc_flags = 0; - lifc.lifc_len = numifs * sizeof (struct lifreq); - lifc.lifc_buf = buf; + for (ifaddrp = ifaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!(ifaddrp->ia_flags & IFF_OFFLINE)) + warn("IFF_OFFLINE vanished on %s\n", ifaddrp->ia_name); - if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { - perror("ioctl : SIOCGLIFCONF"); - exit(1); + if (!set_lifflags(ifaddrp->ia_name, ifaddrp->ia_flags | IFF_UP)) + warn("cannot bring up address on %s\n", + ifaddrp->ia_name); } - lifcr = (struct lifreq *)lifc.lifc_req; - for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifcr++) { - af = lifcr->lifr_addr.ss_family; - (void) strncpy(pi_name, lifcr->lifr_name, - sizeof (pi_name)); - pi_name[sizeof (pi_name) - 1] = '\0'; - if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) - *cp = '\0'; - - if (strcmp(pi_name, ifname) == 0) { - /* It matches the interface name that was offlined */ - (void) strncpy(lifr.lifr_name, lifcr->lifr_name, - sizeof (lifr.lifr_name)); - if (af == AF_INET) - ret = if_up(ifsock_v4, &lifr); - else - ret = if_up(ifsock_v6, &lifr); - if (ret != 0) { - (void) fprintf(stderr, gettext("Bringing up " - "the interfaces failed.\n")); - exit(1); - } - } - } -} + ifaddrlistx_free(ifaddrs); -/* - * Returns -1 on failure. Returns the socket file descriptor on - * success. 
- */ -static int -connect_to_mpathd(sa_family_t family) -{ - int s; - struct sockaddr_storage ss; - struct sockaddr_in *sin = (struct sockaddr_in *)&ss; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss; - struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; - int addrlen; - int ret; - int on; - - s = socket(family, SOCK_STREAM, 0); - if (s < 0) { - perror("socket"); - return (-1); - } - bzero((char *)&ss, sizeof (ss)); - ss.ss_family = family; /* - * Need to bind to a privileged port. For non-root, this - * will fail. in.mpathd verifies that only commands coming - * from privileged ports succeed so that the ordinary user - * can't issue offline commands. + * Undo the offline. */ - on = 1; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - perror("setsockopt : TCP_ANONPRIVBIND"); - exit(1); - } - switch (family) { - case AF_INET: - sin->sin_port = 0; - sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - addrlen = sizeof (struct sockaddr_in); - break; - case AF_INET6: - sin6->sin6_port = 0; - sin6->sin6_addr = loopback_addr; - addrlen = sizeof (struct sockaddr_in6); - break; - } - ret = bind(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - perror("bind"); - return (-1); - } - switch (family) { - case AF_INET: - sin->sin_port = htons(MPATHD_PORT); - break; - case AF_INET6: - sin6->sin6_port = htons(MPATHD_PORT); - break; + if ((retval = ipmp_undo_offline(handle, ifname)) != IPMP_SUCCESS) { + die("cannot undo-offline %s: %s\n", ifname, + mpadm_errmsg(retval)); } - ret = connect(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - perror("connect"); - return (-1); - } - on = 0; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - perror("setsockopt : TCP_ANONPRIVBIND"); - return (-1); - } - return (s); + + /* + * Verify whether IFF_OFFLINE is set as a sanity check. 
+ */ + if (is_offline(ifname)) + warn("in.mpathd has not cleared IFF_OFFLINE on %s\n", ifname); } /* - * Bring down the interface specified by the name lifr->lifr_name. - * - * Returns -1 on failure. Returns 0 on success. + * Change `lifname' to have `flags' set. Returns B_TRUE on success. */ -static int -if_down(int ifsock, struct lifreq *lifr) +static boolean_t +set_lifflags(const char *lifname, uint64_t flags) { - int ret; + struct lifreq lifr = { 0 }; + int fd = (flags & IFF_IPV4) ? sioc4fd : sioc6fd; - ret = ioctl(ifsock, SIOCGLIFFLAGS, (caddr_t)lifr); - if (ret < 0) { - perror("ioctl: SIOCGLIFFLAGS"); - return (-1); - } + (void) strlcpy(lifr.lifr_name, lifname, LIFNAMSIZ); + lifr.lifr_flags = flags; - /* IFF_OFFLINE was set to start with. Is it still there ? */ - if (!(lifr->lifr_flags & (IFF_OFFLINE))) { - (void) fprintf(stderr, gettext("IFF_OFFLINE disappeared on " - "%s\n"), lifr->lifr_name); - return (-1); - } - lifr->lifr_flags &= ~IFF_UP; - ret = ioctl(ifsock, SIOCSLIFFLAGS, (caddr_t)lifr); - if (ret < 0) { - perror("ioctl: SIOCSLIFFLAGS"); - return (-1); - } - return (0); + return (ioctl(fd, SIOCSLIFFLAGS, &lifr) >= 0); } -/* - * Bring up the interface specified by the name lifr->lifr_name. - * - * Returns -1 on failure. Returns 0 on success. - */ -static int -if_up(int ifsock, struct lifreq *lifr) +/* PRINTFLIKE1 */ +static void +die(const char *format, ...) 
{ - int ret; - boolean_t zeroaddr = B_FALSE; - struct sockaddr_in *addr; - - ret = ioctl(ifsock, SIOCGLIFADDR, lifr); - if (ret < 0) { - perror("ioctl: SIOCGLIFADDR"); - return (-1); - } + va_list alist; + char *errstr = strerror(errno); - addr = (struct sockaddr_in *)&lifr->lifr_addr; - switch (addr->sin_family) { - case AF_INET: - zeroaddr = (addr->sin_addr.s_addr == INADDR_ANY); - break; + format = gettext(format); + (void) fprintf(stderr, gettext("%s: fatal: "), progname); - case AF_INET6: - zeroaddr = IN6_IS_ADDR_UNSPECIFIED( - &((struct sockaddr_in6 *)addr)->sin6_addr); - break; + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); - default: - break; - } + if (strchr(format, '\n') == NULL) + (void) fprintf(stderr, ": %s\n", errstr); - ret = ioctl(ifsock, SIOCGLIFFLAGS, lifr); - if (ret < 0) { - perror("ioctl: SIOCGLIFFLAGS"); - return (-1); - } - /* - * Don't affect the state of addresses that failed back. - * - * XXX Link local addresses that are not marked IFF_NOFAILOVER - * will not be brought up. Link local addresses never failover. - * When the interface was offlined, we brought the link local - * address down. We will not bring it up now if IFF_NOFAILOVER - * is not marked. We check for IFF_NOFAILOVER below so that - * we want to maintain the state of all other addresses as it - * was before offline. Normally link local addresses are marked - * IFF_NOFAILOVER and hence this is not an issue. These can - * be fixed in future with RCM and it is beyond the scope - * of if_mpadm to maintain state and do this correctly. - */ - if (!(lifr->lifr_flags & IFF_NOFAILOVER)) - return (0); + exit(EXIT_FAILURE); +} - /* - * When a data address associated with the physical interface itself - * is failed over (e.g., qfe0, rather than qfe0:1), the kernel must - * fill the ipif data structure for qfe0 with a placeholder entry (the - * "replacement ipif"). 
Replacement ipif's cannot be brought IFF_UP - * (nor would it make any sense to do so), so we must be careful to - * skip them; thankfully they can be easily identified since they - * all have a zeroed address. - */ - if (zeroaddr) - return (0); - - /* IFF_OFFLINE was not set to start with. Is it there ? */ - if (lifr->lifr_flags & IFF_OFFLINE) { - (void) fprintf(stderr, - gettext("IFF_OFFLINE set wrongly on %s\n"), - lifr->lifr_name); - return (-1); - } - lifr->lifr_flags |= IFF_UP; - ret = ioctl(ifsock, SIOCSLIFFLAGS, lifr); - if (ret < 0) { - perror("ioctl: SIOCSLIFFLAGS"); - return (-1); - } - return (0); +/* PRINTFLIKE1 */ +static void +warn(const char *format, ...) +{ + va_list alist; + char *errstr = strerror(errno); + + format = gettext(format); + (void) fprintf(stderr, gettext("%s: warning: "), progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + if (strchr(format, '\n') == NULL) + (void) fprintf(stderr, ": %s\n", errstr); } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile index 69e91758ea..e99f2945a7 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile @@ -19,10 +19,9 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# PROG = ifconfig ROOTFS_PROG = $(PROG) @@ -38,7 +37,7 @@ COMMONSRCS= $(CMDINETCOMMONDIR)/$(COMMONOBJS:%.o=%.c) SRCS= $(LOCALSRCS) $(COMMONSRCS) CPPFLAGS += -I$(CMDINETCOMMONDIR) -I$(SRC)/common/net/dhcp -LDLIBS += -ldhcpagent -linetcfg -ldlpi -ldladm +LDLIBS += -ldhcpagent -ldlpi -linetutil -linetcfg -lipmp -ldladm LINTFLAGS += -m ROOTUSRSBINLINKS = $(PROG:%=$(ROOTUSRSBIN)/%) diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h index c993baeb02..4aa1aa0ed7 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/defs.h @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -11,13 +11,12 @@ #ifndef _DEFS_H #define _DEFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif #include <errno.h> +#include <limits.h> #include <unistd.h> #include <stdlib.h> #include <stdio.h> @@ -54,7 +53,10 @@ extern "C" { #include <assert.h> #include <ipmp_mpathd.h> +#include <ipmp_admin.h> #include <inetcfg.h> +#include <libinetutil.h> +#include <alloca.h> #ifdef __cplusplus } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c index f49fca249c..d5517a4700 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* @@ -23,6 +23,7 @@ #define TUN_NAME "tun" #define ATUN_NAME "atun" #define TUN6TO4_NAME "6to4tun" +#define IPMPSTUB (void *)-1 typedef struct if_flags { uint64_t iff_value; @@ -67,7 +68,20 @@ static if_flags_t if_flags_tbl[] = { { IFF_TEMPORARY, "TEMPORARY" }, { IFF_FIXEDMTU, "FIXEDMTU" }, { IFF_VIRTUAL, "VIRTUAL" }, - { IFF_DUPLICATE, "DUPLICATE" } + { IFF_DUPLICATE, "DUPLICATE" }, + { IFF_IPMP, "IPMP"} +}; + +typedef struct { + const char *ia_app; + uint64_t ia_flag; + uint_t ia_tries; +} if_appflags_t; + +static const if_appflags_t if_appflags_tbl[] = { + { "dhcpagent(1M)", IFF_DHCPRUNNING, 1 }, + { "in.ndpd(1M)", IFF_ADDRCONF, 3 }, + { NULL, 0, 0 } }; static struct lifreq lifr; @@ -75,7 +89,6 @@ static struct lifreq lifr; static char name[LIFNAMSIZ]; /* foreach interface saved name */ static char origname[LIFNAMSIZ]; -static char savedname[LIFNAMSIZ]; /* For addif */ static int setaddr; /* @@ -89,20 +102,7 @@ static int setaddr; #define NO_ESP_AALG 256 #define NO_ESP_EALG 256 -/* - * iface_t - * used by setifether to create a list of interfaces to mark - * down-up when changing the ethernet address of an interface - */ -typedef struct iface { - struct lifreq lifr; - struct iface *next; /* pointer to the next list element */ -} iface_t; - -static iface_t *logifs = NULL; /* list of logical interfaces */ -static iface_t *phyif = NULL; /* physical interface */ - -int s; +int s, s4, s6; int af = AF_INET; /* default address family */ int debug = 0; int all = 0; /* setifdhcp() needs to know this */ @@ -113,6 +113,7 @@ int v4compat = 0; /* Compatible printing format */ * Function prototypes for command functions. 
*/ static int addif(char *arg, int64_t param); +static int inetipmp(char *arg, int64_t param); static int inetplumb(char *arg, int64_t param); static int inetunplumb(char *arg, int64_t param); static int removeif(char *arg, int64_t param); @@ -141,7 +142,7 @@ static int modinsert(char *arg, int64_t param); static int modremove(char *arg, int64_t param); static int setifgroupname(char *arg, int64_t param); static int configinfo(char *arg, int64_t param); -static void print_config_flags(uint64_t flags); +static void print_config_flags(int af, uint64_t flags); static void print_flags(uint64_t flags); static void print_ifether(char *ifname); static int set_tun_encap_limit(char *arg, int64_t param); @@ -150,6 +151,7 @@ static int set_tun_hop_limit(char *arg, int64_t param); static int setzone(char *arg, int64_t param); static int setallzones(char *arg, int64_t param); static int setifsrc(char *arg, int64_t param); +static int lifnum(const char *ifname); /* * Address family specific function prototypes. 
@@ -179,19 +181,22 @@ static int settaddr(char *, int (*)(icfg_handle_t, static void status(void); static void ifstatus(const char *); static void usage(void); -static int strioctl(int s, int cmd, char *buf, int buflen); +static int strioctl(int s, int cmd, void *buf, int buflen); static int setifdhcp(const char *caller, const char *ifname, int argc, char *argv[]); static int ip_domux2fd(int *, int *, int *, int *, int *); static int ip_plink(int, int, int, int, int); static int modop(char *arg, char op); -static void selectifs(int argc, char *argv[], int af, - struct lifreq *lifrp); -static int updownifs(iface_t *ifs, int up); static int find_all_global_interfaces(struct lifconf *lifcp, char **buf, int64_t lifc_flags); static int find_all_zone_interfaces(struct lifconf *lifcp, char **buf, int64_t lifc_flags); +static int create_ipmp(const char *grname, int af, const char *ifname, + boolean_t implicit); +static int create_ipmp_peer(int af, const char *ifname); +static void start_ipmp_daemon(void); +static boolean_t ifaddr_up(ifaddrlistx_t *ifaddrp); +static boolean_t ifaddr_down(ifaddrlistx_t *ifaddrp); #define max(a, b) ((a) < (b) ? (b) : (a)) @@ -251,6 +256,7 @@ struct cmd { { "index", NEXTARG, setifindex, 0, AF_ANY }, { "broadcast", NEXTARG, setifbroadaddr, 0, AF_INET }, { "auto-revarp", 0, setifrevarp, 1, AF_INET }, + { "ipmp", 0, inetipmp, 1, AF_ANY }, { "plumb", 0, inetplumb, 1, AF_ANY }, { "unplumb", 0, inetunplumb, 0, AF_ANY }, { "subnet", NEXTARG, setifsubnet, 0, AF_ANY }, @@ -297,22 +303,30 @@ struct cmd { typedef struct if_config_cmd { uint64_t iff_flag; + int iff_af; char *iff_name; } if_config_cmd_t; +/* + * NOTE: print_config_flags() processes this table in order, so we put "up" + * last so that we can be sure "-failover" will take effect first. Otherwise, + * IPMP test addresses will erroneously migrate to the IPMP interface. 
+ */ static if_config_cmd_t if_config_cmd_tbl[] = { - { IFF_UP, "up" }, - { IFF_NOTRAILERS, "-trailers" }, - { IFF_PRIVATE, "private" }, - { IFF_NOXMIT, "-xmit" }, - { IFF_ANYCAST, "anycast" }, - { IFF_NOLOCAL, "-local" }, - { IFF_DEPRECATED, "deprecated" }, - { IFF_NOFAILOVER, "-failover" }, - { IFF_STANDBY, "standby" }, - { IFF_FAILED, "failed" }, - { IFF_PREFERRED, "preferred" }, - { 0, 0 }, + { IFF_NOTRAILERS, AF_UNSPEC, "-trailers" }, + { IFF_PRIVATE, AF_UNSPEC, "private" }, + { IFF_NOXMIT, AF_UNSPEC, "-xmit" }, + { IFF_ANYCAST, AF_INET6, "anycast" }, + { IFF_NOLOCAL, AF_UNSPEC, "-local" }, + { IFF_DEPRECATED, AF_UNSPEC, "deprecated" }, + { IFF_NOFAILOVER, AF_UNSPEC, "-failover" }, + { IFF_STANDBY, AF_UNSPEC, "standby" }, + { IFF_FAILED, AF_UNSPEC, "failed" }, + { IFF_PREFERRED, AF_UNSPEC, "preferred" }, + { IFF_NONUD, AF_INET6, "-nud" }, + { IFF_NOARP, AF_INET, "-arp" }, + { IFF_UP, AF_UNSPEC, "up" }, + { 0, 0, NULL }, }; typedef struct ni { @@ -345,10 +359,11 @@ struct afswtch *afp; /* the address family being set or asked about */ int main(int argc, char *argv[]) { - /* Include IFF_NOXMIT, IFF_TEMPORARY and all zone interfaces */ - int64_t lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; + int64_t lifc_flags; char *default_ip_str; + lifc_flags = LIFC_NOXMIT|LIFC_TEMPORARY|LIFC_ALLZONES|LIFC_UNDER_IPMP; + if (argc < 2) { usage(); exit(1); @@ -388,9 +403,10 @@ main(int argc, char *argv[]) } s = socket(SOCKET_AF(af), SOCK_DGRAM, 0); - if (s < 0) { + s4 = socket(AF_INET, SOCK_DGRAM, 0); + s6 = socket(AF_INET6, SOCK_DGRAM, 0); + if (s == -1 || s4 == -1 || s6 == -1) Perror0_exit("socket"); - } /* * Special interface names is any combination of these flags. 
@@ -1441,39 +1457,38 @@ setifdstaddr(char *addr, int64_t param) static int setifflags(char *val, int64_t value) { - int phyintlen, origphyintlen; + struct lifreq lifrl; /* local lifreq struct */ + boolean_t bringup = _B_FALSE; (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0) Perror0_exit("setifflags: SIOCGLIFFLAGS"); - if (value == IFF_NOFAILOVER) { - /* - * Fail if '-failover' is set after a prior addif created the - * alias on a different interface. This can happen when the - * interface is part of an IPMP group. - */ - phyintlen = strcspn(name, ":"); - origphyintlen = strcspn(origname, ":"); - if (phyintlen != origphyintlen || - strncmp(name, origname, phyintlen) != 0) { - (void) fprintf(stderr, "ifconfig: can't set -failover " - "on failed/standby/offlined interface %s\n", - origname); - exit(1); - } - } - if (value < 0) { value = -value; + + if ((value & IFF_NOFAILOVER) && (lifr.lifr_flags & IFF_UP)) { + /* + * The kernel does not allow administratively up test + * addresses to be converted to data addresses. Bring + * the address down first, then bring it up after it's + * been converted to a data address. + */ + lifr.lifr_flags &= ~IFF_UP; + (void) ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr); + bringup = _B_TRUE; + } + lifr.lifr_flags &= ~value; - if ((value & IFF_UP) && (lifr.lifr_flags & IFF_DUPLICATE)) { + if ((value & (IFF_UP | IFF_NOFAILOVER)) && + (lifr.lifr_flags & IFF_DUPLICATE)) { /* * If the user is trying to mark an interface with a - * duplicate address as "down," then fetch the address - * and set it. This will cause IP to clear the - * IFF_DUPLICATE flag and stop the automatic recovery - * timer. + * duplicate address as "down," or convert a duplicate + * test address to a data address, then fetch the + * address and set it. This will cause IP to clear + * the IFF_DUPLICATE flag and stop the automatic + * recovery timer. 
*/ value = lifr.lifr_flags; if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) >= 0) @@ -1483,10 +1498,48 @@ setifflags(char *val, int64_t value) } else { lifr.lifr_flags |= value; } + + /* + * If we're about to bring up an underlying physical IPv6 interface in + * an IPMP group, ensure the IPv6 IPMP interface is also up. This is + * for backward compatibility with legacy configurations in which + * there are no explicit hostname files for IPMP interfaces. (For + * IPv4, this is automatically handled by the kernel when migrating + * the underlying interface's data address to the IPMP interface.) + */ + (void) strlcpy(lifrl.lifr_name, name, LIFNAMSIZ); + + if (lifnum(lifr.lifr_name) == 0 && + (lifr.lifr_flags & (IFF_UP|IFF_IPV6)) == (IFF_UP|IFF_IPV6) && + ioctl(s, SIOCGLIFGROUPNAME, &lifrl) == 0 && + lifrl.lifr_groupname[0] != '\0') { + lifgroupinfo_t lifgr; + + (void) strlcpy(lifgr.gi_grname, lifrl.lifr_groupname, + LIFGRNAMSIZ); + if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1) + Perror0_exit("setifflags: SIOCGLIFGROUPINFO"); + + (void) strlcpy(lifrl.lifr_name, lifgr.gi_grifname, LIFNAMSIZ); + if (ioctl(s, SIOCGLIFFLAGS, &lifrl) == -1) + Perror0_exit("setifflags: SIOCGLIFFLAGS"); + if (!(lifrl.lifr_flags & IFF_UP)) { + lifrl.lifr_flags |= IFF_UP; + if (ioctl(s, SIOCSLIFFLAGS, &lifrl) == -1) + Perror0_exit("setifflags: SIOCSLIFFLAGS"); + } + } + (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); - if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) { + if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) Perror0_exit("setifflags: SIOCSLIFFLAGS"); + + if (bringup) { + lifr.lifr_flags |= IFF_UP; + if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) + Perror0_exit("setifflags: SIOCSLIFFLAGS IFF_UP"); } + return (0); } @@ -1524,12 +1577,21 @@ setifindex(char *val, int64_t param) } /* ARGSUSED */ +static void +notifycb(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg) +{ +} + +/* ARGSUSED */ static int setifether(char *addr, int64_t param) { - uchar_t *ea; - iface_t 
*current; - int maclen; + uchar_t *hwaddr; + int hwaddrlen; + int retval; + ifaddrlistx_t *ifaddrp, *ifaddrs = NULL; + dlpi_handle_t dh; + dlpi_notifyid_t id; if (addr == NULL) { ifstatus(name); @@ -1537,9 +1599,6 @@ setifether(char *addr, int64_t param) return (0); } - phyif = NULL; - logifs = NULL; - /* * if the IP interface in the arguments is a logical * interface, exit with an error now. @@ -1550,79 +1609,68 @@ setifether(char *addr, int64_t param) exit(1); } - ea = _link_aton(addr, &maclen); - if (ea == NULL) { - if (maclen == -1) + if ((hwaddr = _link_aton(addr, &hwaddrlen)) == NULL) { + if (hwaddrlen == -1) (void) fprintf(stderr, - "ifconfig: %s: bad address\n", addr); + "ifconfig: %s: bad address\n", addr); else (void) fprintf(stderr, "ifconfig: malloc() failed\n"); exit(1); } - (void) strncpy(savedname, name, sizeof (savedname)); + if ((retval = dlpi_open(name, &dh, 0)) != DLPI_SUCCESS) + Perrdlpi_exit("cannot dlpi_open() link", name, retval); - /* - * Call selectifs only for the IP interfaces that are ipv4.
- * offflags == IFF_IPV6 because you should not change the - * Ethernet address of an ipv6 interface - */ - foreachinterface(selectifs, 0, (char **)NULL, 0, 0, IFF_IPV6, 0); + if ((retval = dlpi_bind(dh, DLPI_ANY_SAP, NULL)) != DLPI_SUCCESS) + Perrdlpi_exit("cannot dlpi_bind() link", name, retval); - /* If physical interface not found, exit now */ - if (phyif == NULL) { - (void) fprintf(stderr, - "ifconfig: interface %s not found\n", savedname); - exit(1); - } - - /* Restore */ - (void) strncpy(name, savedname, sizeof (name)); - (void) strncpy(origname, savedname, sizeof (origname)); - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); - - /* - * close and reopen the socket - * we don't know which type of socket we have now - */ - (void) close(s); - s = socket(SOCKET_AF(AF_UNSPEC), SOCK_DGRAM, 0); - if (s < 0) { - Perror0_exit("socket"); - } - - /* - * mark down the logical interfaces first, - * and then the physical interface - */ - if (updownifs(logifs, 0) < 0 || updownifs(phyif, 0) < 0) { - Perror0_exit("mark down interface failed"); + retval = dlpi_enabnotify(dh, DL_NOTE_PHYS_ADDR, notifycb, NULL, &id); + if (retval == DLPI_SUCCESS) { + (void) dlpi_disabnotify(dh, id, NULL); + } else { + /* + * This link does not support DL_NOTE_PHYS_ADDR: bring down + * all of the addresses to flush the old hardware address + * information out of IP. + * + * NOTE: Skipping this when DL_NOTE_PHYS_ADDR is supported is + * more than an optimization: in.mpathd will set IFF_OFFLINE + * if it's notified and the new address is a duplicate of + * another in the group -- but the flags manipulation in + * ifaddr_{down,up}() cannot be atomic and thus might clobber + * IFF_OFFLINE, confusing in.mpathd. 
+ */ + if (ifaddrlistx(name, IFF_UP, 0, &ifaddrs) == -1) + Perror2_exit(name, "cannot get address list"); + + ifaddrp = ifaddrs; + for (; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!ifaddr_down(ifaddrp)) { + Perror2_exit(ifaddrp->ia_name, + "cannot bring down"); + } + } } /* - * Change the physical address + * Change the hardware address. */ - if (dlpi_set_address(savedname, ea, maclen) == -1) { + retval = dlpi_set_physaddr(dh, DL_CURR_PHYS_ADDR, hwaddr, hwaddrlen); + if (retval != DLPI_SUCCESS) { (void) fprintf(stderr, - "ifconfig: failed setting mac address on %s\n", - savedname); + "ifconfig: failed setting mac address on %s\n", name); } + dlpi_close(dh); /* - * if any interfaces were marked down before changing the - * ethernet address, put them up again. - * First the physical interface, then the logical ones. + * If any addresses were brought down before changing the hardware + * address, bring them up again. */ - if (updownifs(phyif, 1) < 0 || updownifs(logifs, 1) < 0) { - Perror0_exit("mark down interface failed"); - } - - /* Free the memory allocated by selectifs */ - free(phyif); - for (current = logifs; current != NULL; current = logifs) { - logifs = logifs->next; - free(current); + for (ifaddrp = ifaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!ifaddr_up(ifaddrp)) + Perror2_exit(ifaddrp->ia_name, "cannot bring up"); } + ifaddrlistx_free(ifaddrs); return (0); } @@ -1655,8 +1703,8 @@ print_ifether(char *ifname) } (void) close(fd); - /* Virtual interfaces don't have MAC addresses */ - if (lifr.lifr_flags & IFF_VIRTUAL) + /* VNI and IPMP interfaces don't have MAC addresses */ + if (lifr.lifr_flags & (IFF_VIRTUAL|IFF_IPMP)) return; /* @@ -1685,104 +1733,6 @@ print_ifether(char *ifname) } /* - * static void selectifs(int argc, char *argv[], int af, struct lifreq *rp) - * - * Called inside setifether() to create a list of interfaces to - * mark down/up when changing the Ethernet address. 
- * If the current interface is the physical interface passed - * as an argument to ifconfig, update phyif. - * If the current interface is a logical interface associated - * to the physical interface, add it to the logifs list. - */ -/* ARGSUSED */ -static void -selectifs(int argc, char *argv[], int af, struct lifreq *rp) -{ - char *colonp; - int length; - iface_t *current; - - /* - * savedname= name of the IP interface to which you want to - * change ethernet address - * name= name of the current IP interface - */ - colonp = strchr(name, ':'); - if (colonp == NULL) - length = max(strlen(savedname), strlen(name)); - else - length = max(strlen(savedname), colonp - name); - if (strncmp(savedname, name, length) == 0) { - (void) strcpy(lifr.lifr_name, name); - if (ioctl(s, SIOCGLIFFLAGS, &lifr) < 0) { - Perror0("selectifs: SIOCGLIFFLAGS"); - return; - } - - if ((current = malloc(sizeof (iface_t))) == NULL) { - Perror0_exit("selectifs: malloc failed\n"); - } - - if (colonp == NULL) { - /* this is the physical interface */ - phyif = current; - bcopy(&lifr, &phyif->lifr, sizeof (struct lifreq)); - phyif->next = NULL; - } else { - /* this is a logical interface */ - bcopy(&lifr, &current->lifr, sizeof (struct lifreq)); - current->next = logifs; - logifs = current; - } - } -} - -/* - * static int updownifs(iface_t *ifs, int up) - * - * It takes in input a list of IP interfaces (ifs) - * and a flag (up). - * It marks each interface in the list down (up = 0) - * or up (up > 0). This is done ONLY if the IP - * interface was originally up.
- * - * Return values: - * 0 = everything OK - * -1 = problem - */ -static int -updownifs(iface_t *ifs, int up) -{ - iface_t *current; - int ret = 0; - int save_errno; - char savename[LIFNAMSIZ]; - uint64_t orig_flags; - - for (current = ifs; current != NULL; current = current->next) { - if (current->lifr.lifr_flags & IFF_UP) { - orig_flags = current->lifr.lifr_flags; - if (!up) - current->lifr.lifr_flags &= ~IFF_UP; - if (ioctl(s, SIOCSLIFFLAGS, &current->lifr) < 0) { - save_errno = errno; - (void) strcpy(savename, - current->lifr.lifr_name); - ret = -1; - } - if (!up) /* restore the original flags */ - current->lifr.lifr_flags = orig_flags; - } - } - - if (ret == -1) { - (void) strcpy(lifr.lifr_name, savename); - errno = save_errno; - } - return (ret); -} - -/* * static int find_all_global_interfaces(struct lifconf *lifcp, char **buf, * int64_t lifc_flags) * @@ -2109,130 +2059,217 @@ setiftoken(char *addr, int64_t param) return (0); } -/* - * Return value: 0 on success, -1 on failure. - */ -static int -connect_to_mpathd(int family) -{ - int s; - struct sockaddr_storage ss; - struct sockaddr_in *sin = (struct sockaddr_in *)&ss; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss; - struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; - int addrlen; - int ret; - int on; - - s = socket(family, SOCK_STREAM, 0); - if (s < 0) { - Perror0_exit("connect_to_mpathd: socket"); - } - (void) bzero((char *)&ss, sizeof (ss)); - ss.ss_family = family; - /* - * Need to bind to a privileged port. For non-root, this - * will fail.
in.mpathd verifies that only commands coming - * from privileged ports succeed so that ordinary users - * can't connect and start talking to in.mpathd - */ - on = 1; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - Perror0_exit("connect_to_mpathd: setsockopt"); - } - switch (family) { - case AF_INET: - sin->sin_port = 0; - sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - addrlen = sizeof (struct sockaddr_in); - break; - case AF_INET6: - sin6->sin6_port = 0; - sin6->sin6_addr = loopback_addr; - addrlen = sizeof (struct sockaddr_in6); - break; - } - ret = bind(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - (void) close(s); - return (-1); - } - - switch (family) { - case AF_INET: - sin->sin_port = htons(MPATHD_PORT); - break; - case AF_INET6: - sin6->sin6_port = htons(MPATHD_PORT); - break; - } - ret = connect(s, (struct sockaddr *)&ss, addrlen); - (void) close(s); - return (ret); -} - /* ARGSUSED */ static int -setifgroupname(char *grpname, int64_t param) +setifgroupname(char *grname, int64_t param) { + lifgroupinfo_t lifgr; + struct lifreq lifrl; + ifaddrlistx_t *ifaddrp, *nextifaddrp; + ifaddrlistx_t *ifaddrs = NULL, *downaddrs = NULL; + int af; + if (debug) { (void) printf("Setting groupname %s on interface %s\n", - grpname, name); - } - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); - (void) strncpy(lifr.lifr_groupname, grpname, - sizeof (lifr.lifr_groupname)); - if (ioctl(s, SIOCSLIFGROUPNAME, (caddr_t)&lifr) < 0) { - Perror0_exit("setifgroupname: SIOCSLIFGROUPNAME"); + grname, name); } - /* - * If the SUNW_NO_MPATHD environment variable is set then don't - * bother starting up in.mpathd. See PSARC/2002/249 for the - * depressing details on this bit of stupidity. 
- */ - if (getenv("SUNW_NO_MPATHD") != NULL) { - return (0); + (void) strlcpy(lifrl.lifr_name, name, LIFNAMSIZ); + (void) strlcpy(lifrl.lifr_groupname, grname, LIFGRNAMSIZ); + + while (ioctl(s, SIOCSLIFGROUPNAME, &lifrl) == -1) { + switch (errno) { + case ENOENT: + /* + * The group doesn't yet exist; create it and repeat. + */ + af = afp->af_af; + if (create_ipmp(grname, af, NULL, _B_TRUE) == -1) { + if (errno == EEXIST) + continue; + + Perror2(grname, "cannot create IPMP group"); + goto fail; + } + continue; + + case EALREADY: + /* + * The interface is already in another group; must + * remove existing membership first. + */ + lifrl.lifr_groupname[0] = '\0'; + if (ioctl(s, SIOCSLIFGROUPNAME, &lifrl) == -1) { + Perror2(name, "cannot remove existing " + "IPMP group membership"); + goto fail; + } + (void) strlcpy(lifrl.lifr_groupname, grname, + LIFGRNAMSIZ); + continue; + + case EAFNOSUPPORT: + /* + * The group exists, but it's not configured with the + * address families the interface needs. Since only + * two address families are currently supported, just + * configure the "other" address family. Note that we + * may race with group deletion or creation by another + * process (ENOENT or EEXIST); in such cases we repeat + * our original SIOCSLIFGROUPNAME. + */ + (void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ); + if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1) { + if (errno == ENOENT) + continue; + + Perror2(grname, "SIOCGLIFGROUPINFO"); + goto fail; + } + + af = lifgr.gi_v4 ? AF_INET6 : AF_INET; + if (create_ipmp(grname, af, lifgr.gi_grifname, + _B_TRUE) == -1) { + if (errno == EEXIST) + continue; + + Perror2(grname, "cannot configure IPMP group"); + goto fail; + } + continue; + + case EADDRINUSE: + /* + * Some addresses are in-use (or under control of DAD). + * Bring them down and retry the group join operation. + * We will bring them back up after the interface has + * been placed in the group. 
+ */ + if (ifaddrlistx(lifrl.lifr_name, IFF_UP|IFF_DUPLICATE, + 0, &ifaddrs) == -1) { + Perror2(grname, "cannot get address list"); + goto fail; + } + + ifaddrp = ifaddrs; + for (; ifaddrp != NULL; ifaddrp = nextifaddrp) { + if (!ifaddr_down(ifaddrp)) { + ifaddrs = ifaddrp; + goto fail; + } + nextifaddrp = ifaddrp->ia_next; + ifaddrp->ia_next = downaddrs; + downaddrs = ifaddrp; + } + ifaddrs = NULL; + continue; + + case EADDRNOTAVAIL: { + /* + * Some data addresses are under application control. + * For some of these (e.g., ADDRCONF), the application + * should remove the address, in which case we retry a + * few times (since the application's action is not + * atomic with respect to us) before bailing out and + * informing the user. + */ + int ntries, nappaddr = 0; + const if_appflags_t *iap = if_appflags_tbl; + + for (; iap->ia_app != NULL; iap++) { + ntries = 0; +again: + if (ifaddrlistx(lifrl.lifr_name, iap->ia_flag, + IFF_NOFAILOVER, &ifaddrs) == -1) { + (void) fprintf(stderr, "ifconfig: %s: " + "cannot get data addresses managed " + "by %s\n", lifrl.lifr_name, + iap->ia_app); + goto fail; + } + + if (ifaddrs == NULL) + continue; + + ifaddrlistx_free(ifaddrs); + ifaddrs = NULL; + + if (++ntries < iap->ia_tries) { + (void) poll(NULL, 0, 100); + goto again; + } + + (void) fprintf(stderr, "ifconfig: cannot join " + "IPMP group: %s has data addresses managed " + "by %s\n", lifrl.lifr_name, iap->ia_app); + nappaddr++; + } + if (nappaddr > 0) + goto fail; + continue; + } + default: + Perror2(name, "SIOCSLIFGROUPNAME"); + goto fail; + } } /* - * Try to connect to in.mpathd using IPv4. If we succeed, - * we conclude that in.mpathd is running, and quit. + * If there were addresses that we had to bring down, it's time to + * bring them up again. As part of bringing them up, the kernel will + * automatically move them to the new IPMP interface. 
*/ - if (connect_to_mpathd(AF_INET) == 0) { - /* connect succeeded, mpathd is already running */ - return (0); + for (ifaddrp = downaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!ifaddr_up(ifaddrp) && errno != ENXIO) { + (void) fprintf(stderr, "ifconfig: cannot bring back up " + "%s: %s\n", ifaddrp->ia_name, strerror(errno)); + } } + ifaddrlistx_free(downaddrs); + return (0); +fail: /* - * Try to connect to in.mpathd using IPv6. If we succeed, - * we conclude that in.mpathd is running, and quit. + * Attempt to bring back up any interfaces that we downed. */ - if (connect_to_mpathd(AF_INET6) == 0) { - /* connect succeeded, mpathd is already running */ - return (0); + for (ifaddrp = downaddrs; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (!ifaddr_up(ifaddrp) && errno != ENXIO) { + (void) fprintf(stderr, "ifconfig: cannot bring back up " + "%s: %s\n", ifaddrp->ia_name, strerror(errno)); + } } + ifaddrlistx_free(downaddrs); + ifaddrlistx_free(ifaddrs); /* - * in.mpathd may not be running. Start it now. If it is already - * running, in.mpathd will take care of handling multiple incarnations - * of itself. ifconfig only tries to optimize performance by not - * starting another incarnation of in.mpathd. + * We'd return -1, but foreachinterface() doesn't propagate the error + * into the exit status, so we're forced to explicitly exit(). 
*/ - switch (fork()) { + exit(1); + /* NOTREACHED */ +} - case -1: - Perror0_exit("setifgroupname: fork"); - /* NOTREACHED */ - case 0: - (void) execl(MPATHD_PATH, MPATHD_PATH, NULL); - _exit(1); - /* NOTREACHED */ - default: - return (0); +static boolean_t +modcheck(const char *ifname) +{ + (void) strlcpy(lifr.lifr_name, ifname, sizeof (lifr.lifr_name)); + + if (ioctl(s, SIOCGLIFFLAGS, &lifr) < 0) { + Perror0("SIOCGLIFFLAGS"); + return (_B_FALSE); } -} + if (lifr.lifr_flags & IFF_IPMP) { + (void) fprintf(stderr, "ifconfig: %s: module operations not" + " supported on IPMP interfaces\n", ifname); + return (_B_FALSE); + } + if (lifr.lifr_flags & IFF_VIRTUAL) { + (void) fprintf(stderr, "ifconfig: %s: module operations not" + " supported on virtual IP interfaces\n", ifname); + return (_B_FALSE); + } + return (_B_TRUE); +} /* * To list all the modules above a given network interface. @@ -2250,7 +2287,13 @@ modlist(char *null, int64_t param) struct str_list strlist; int orig_arpid; - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + /* + * We'd return -1, but foreachinterface() doesn't propagate the error + * into the exit status, so we're forced to explicitly exit(). + */ + if (!modcheck(name)) + exit(1); + if (ip_domux2fd(&muxfd, &muxid_fd, &ipfd_lowstr, &arpfd_lowstr, &orig_arpid) < 0) { return (-1); @@ -2354,8 +2397,8 @@ open_arp_on_udp(char *udp_dev_name) * Return: * -1 if operation fails, 0 otherwise. * - * Please see the big block comment above plumb_one_device() - * for the logic of the PLINK/PUNLINK + * Please see the big block comment above ifplumb() for the logic of the + * PLINK/PUNLINK */ static int ip_domux2fd(int *muxfd, int *muxid_fd, int *ipfd_lowstr, int *arpfd_lowstr, @@ -2467,8 +2510,8 @@ ip_domux2fd(int *muxfd, int *muxid_fd, int *ipfd_lowstr, int *arpfd_lowstr, * Return: * -1 if operation fails, 0 otherwise. 
* - * Please see the big block comment above plumb_one_device() - * for the logic of the PLINK/PUNLINK + * Please see the big block comment above ifplumb() for the logic of the + * PLINK/PUNLINK */ static int ip_plink(int muxfd, int muxid_fd, int ipfd_lowstr, int arpfd_lowstr, @@ -2530,7 +2573,12 @@ modop(char *arg, char op) char *arg_str; int orig_arpid; - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + /* + * We'd return -1, but foreachinterface() doesn't propagate the error + * into the exit status, so we're forced to explicitly exit(). + */ + if (!modcheck(name)) + exit(1); /* Need to save the original string for -a option. */ if ((arg_str = malloc(strlen(arg) + 1)) == NULL) { @@ -3067,13 +3115,14 @@ status(void) static int configinfo(char *null, int64_t param) { + char *cp; struct afswtch *p = afp; uint64_t flags; - char phydevname[LIFNAMSIZ]; + char lifname[LIFNAMSIZ]; char if_usesrc_name[LIFNAMSIZ]; - char *cp; (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0) { Perror0_exit("status: SIOCGLIFFLAGS"); } @@ -3084,13 +3133,13 @@ configinfo(char *null, int64_t param) name, flags, p != NULL ? p->af_af : -1); } - /* remove LIF component */ - (void) strncpy(phydevname, name, sizeof (phydevname)); - cp = strchr(phydevname, ':'); - if (cp) { - *cp = 0; - } - phydevname[sizeof (phydevname) - 1] = '\0'; + /* + * Build the interface name to print (we can't directly use `name' + * because one cannot "plumb" ":0" interfaces). 
+ */ + (void) strlcpy(lifname, name, LIFNAMSIZ); + if ((cp = strchr(lifname, ':')) != NULL && atoi(cp + 1) == 0) + *cp = '\0'; /* * if the interface is IPv4 @@ -3105,7 +3154,7 @@ configinfo(char *null, int64_t param) if (v4compat) flags &= ~IFF_IPV4; - (void) printf("%s inet plumb", phydevname); + (void) printf("%s inet plumb", lifname); } else if (flags & IFF_IPV6) { /* * else if the interface is IPv6 @@ -3117,7 +3166,7 @@ configinfo(char *null, int64_t param) if (v4compat) return (-1); - (void) printf("%s inet6 plumb", phydevname); + (void) printf("%s inet6 plumb", lifname); } (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); @@ -3131,8 +3180,8 @@ configinfo(char *null, int64_t param) ioctl(s, SIOCGLIFMTU, (caddr_t)&lifr) >= 0) (void) printf(" mtu %d", lifr.lifr_metric); - /* don't print index when in compatibility mode */ - if (!v4compat) { + /* Index only applies to the zeroth interface */ + if (lifnum(name) == 0) { if (ioctl(s, SIOCGLIFINDEX, (caddr_t)&lifr) >= 0) (void) printf(" index %d", lifr.lifr_index); } @@ -3162,7 +3211,6 @@ configinfo(char *null, int64_t param) } (void) printf("\n"); - return (0); } @@ -3398,15 +3446,11 @@ in_status(int force, uint64_t flags) inet_ntoa(sin->sin_addr)); } } - /* If there is a groupname, print it for lun 0 alone */ + /* If there is a groupname, print it for only the physical interface */ if (strchr(name, ':') == NULL) { - (void) memset(lifr.lifr_groupname, 0, - sizeof (lifr.lifr_groupname)); - if (ioctl(s, SIOCGLIFGROUPNAME, (caddr_t)&lifr) >= 0) { - if (strlen(lifr.lifr_groupname) > 0) { - (void) printf("\n\tgroupname %s", - lifr.lifr_groupname); - } + if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) >= 0 && + lifr.lifr_groupname[0] != '\0') { + (void) printf("\n\tgroupname %s", lifr.lifr_groupname); } } (void) putchar('\n'); @@ -3550,11 +3594,7 @@ in_configinfo(int force, uint64_t flags) Perror0_exit("in_configinfo: SIOCGLIFADDR"); } sin = (struct sockaddr_in *)&lifr.lifr_addr; - if (strchr(name, ':') != NULL) { - 
(void) printf(" addif %s ", inet_ntoa(sin->sin_addr)); - } else { - (void) printf(" set %s ", inet_ntoa(sin->sin_addr)); - } + (void) printf(" set %s ", inet_ntoa(sin->sin_addr)); laddr = sin; } @@ -3614,8 +3654,8 @@ in_configinfo(int force, uint64_t flags) } } - /* If there is a groupname, print it for only the physical interface */ - if (strchr(name, ':') == NULL) { + /* If there is a groupname, print it for only the zeroth interface */ + if (lifnum(name) == 0) { if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) >= 0 && lifr.lifr_groupname[0] != '\0') { (void) printf(" group %s ", lifr.lifr_groupname); @@ -3623,12 +3663,7 @@ in_configinfo(int force, uint64_t flags) } /* Print flags to configure */ - print_config_flags(flags); - - /* IFF_NOARP applies to AF_INET only */ - if (flags & IFF_NOARP) { - (void) printf("-arp "); - } + print_config_flags(AF_INET, flags); } static void @@ -3657,17 +3692,9 @@ in6_configinfo(int force, uint64_t flags) Perror0_exit("in6_configinfo: SIOCGLIFADDR"); } sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; - if (strchr(name, ':') != NULL) { - (void) printf(" addif %s/%d ", - inet_ntop(AF_INET6, (void *)&sin6->sin6_addr, - abuf, sizeof (abuf)), - lifr.lifr_addrlen); - } else { - (void) printf(" set %s/%d ", - inet_ntop(AF_INET6, (void *)&sin6->sin6_addr, - abuf, sizeof (abuf)), - lifr.lifr_addrlen); - } + (void) printf(" set %s/%d ", + inet_ntop(AF_INET6, &sin6->sin6_addr, abuf, sizeof (abuf)), + lifr.lifr_addrlen); laddr6 = sin6; } (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); @@ -3720,8 +3747,8 @@ in6_configinfo(int force, uint64_t flags) lifr.lifr_addrlen); } - /* If there is a groupname, print it for only the physical interface */ - if (strchr(name, ':') == NULL) { + /* If there is a groupname, print it for only the zeroth interface */ + if (lifnum(name) == 0) { if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) >= 0 && lifr.lifr_groupname[0] != '\0') { (void) printf(" group %s ", lifr.lifr_groupname); @@ -3729,12 +3756,7 @@ 
in6_configinfo(int force, uint64_t flags) } /* Print flags to configure */ - print_config_flags(flags); - - /* IFF_NONUD applies to AF_INET6 only */ - if (flags & IFF_NONUD) { - (void) printf("-nud "); - } + print_config_flags(AF_INET6, flags); } /* @@ -3768,31 +3790,41 @@ in6_configinfo(int force, uint64_t flags) * compatibility for other utilities like atmifconfig etc. In this case * the utility must use SIOCSLIFMUXID. */ -static void -plumb_one_device(int af) +static int +ifplumb(const char *linkname, const char *ifname, boolean_t genppa, int af) { int arp_muxid = -1, ip_muxid; int mux_fd, ip_fd, arp_fd; int retval; - uint_t ppa; char *udp_dev_name; - char provider[DLPI_LINKNAME_MAX]; + uint64_t flags; + uint_t dlpi_flags; dlpi_handle_t dh_arp, dh_ip; /* - * We use DLPI_NOATTACH because the ip module will do the attach - * itself for DLPI style-2 devices. + * Always dlpi_open() with DLPI_NOATTACH because the IP and ARP module + * will do the attach themselves for DLPI style-2 links. */ - retval = dlpi_open(name, &dh_ip, DLPI_NOATTACH); - if (retval != DLPI_SUCCESS) - Perrdlpi_exit("cannot open link", name, retval); + dlpi_flags = DLPI_NOATTACH; - if ((retval = dlpi_parselink(name, provider, &ppa)) != DLPI_SUCCESS) - Perrdlpi_exit("dlpi_parselink", name, retval); + /* + * If `linkname' is the special token IPMPSTUB, then this is a request + * to create an IPMP interface atop /dev/ipmpstub0. (We can't simply + * pass "ipmpstub0" as `linkname' since an admin *could* have a normal + * vanity-named link named "ipmpstub0" that they'd like to plumb.) 
+ */ + if (linkname == IPMPSTUB) { + linkname = "ipmpstub0"; + dlpi_flags |= DLPI_DEVONLY; + } + + retval = dlpi_open(linkname, &dh_ip, dlpi_flags); + if (retval != DLPI_SUCCESS) + Perrdlpi_exit("cannot open link", linkname, retval); if (debug) { - (void) printf("ifconfig: plumb_one_device: provider %s," - " ppa %u\n", provider, ppa); + (void) printf("ifconfig: ifplumb: link %s, ifname %s, " + "genppa %u\n", linkname, ifname, genppa); } ip_fd = dlpi_fd(dh_ip); @@ -3812,29 +3844,106 @@ plumb_one_device(int af) Perror2_exit("I_PUSH", ARP_MOD_NAME); /* - * Set IFF_IPV4/IFF_IPV6 flags. - * At this point in time the kernel also allows an - * override of the CANTCHANGE flags. + * Prepare to set IFF_IPV4/IFF_IPV6 flags as part of SIOCSLIFNAME. + * (At this point in time the kernel also allows an override of the + * IFF_CANTCHANGE flags.) */ lifr.lifr_name[0] = '\0'; if (ioctl(ip_fd, SIOCGLIFFLAGS, (char *)&lifr) == -1) - Perror0_exit("plumb_one_device: SIOCGLIFFLAGS"); + Perror0_exit("ifplumb: SIOCGLIFFLAGS"); - /* Set the name string and the IFF_IPV* flag */ if (af == AF_INET6) { - lifr.lifr_flags |= IFF_IPV6; - lifr.lifr_flags &= ~(IFF_BROADCAST | IFF_IPV4); + flags = lifr.lifr_flags | IFF_IPV6; + flags &= ~(IFF_BROADCAST | IFF_IPV4); } else { - lifr.lifr_flags |= IFF_IPV4; - lifr.lifr_flags &= ~IFF_IPV6; + flags = lifr.lifr_flags | IFF_IPV4; + flags &= ~IFF_IPV6; } - /* record the device and module names as interface name */ - lifr.lifr_ppa = ppa; - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + /* + * Set the interface name. If we've been asked to generate the PPA, + * then find the lowest available PPA (only currently used for IPMP + * interfaces). Otherwise, use the interface name as-is. + */ + if (genppa) { + int ppa; + + /* + * We'd like to just set lifr_ppa to UINT_MAX and have the + * kernel pick a PPA. Unfortunately, that would mishandle + * two cases: + * + * 1. 
If the PPA is available but the groupname is taken + * (e.g., the "ipmp2" IP interface name is available + * but the "ipmp2" groupname is taken) then the + * auto-assignment by the kernel will fail. + * + * 2. If we're creating (e.g.) an IPv6-only IPMP + * interface, and there's already an IPv4-only IPMP + * interface, the kernel will allow us to accidentally + * reuse the IPv6 IPMP interface name (since + * SIOCSLIFNAME uniqueness is per-interface-type). + * This will cause administrative confusion. + * + * Thus, we instead take a brute-force approach of checking + * whether the IPv4 or IPv6 name is already in-use before + * attempting the SIOCSLIFNAME. As per (1) above, the + * SIOCSLIFNAME may still fail, in which case we just proceed + * to the next one. If this approach becomes too slow, we + * can add a new SIOC* to handle this case in the kernel. + */ + for (ppa = 0; ppa < UINT_MAX; ppa++) { + (void) snprintf(lifr.lifr_name, LIFNAMSIZ, "%s%d", + ifname, ppa); + + if (ioctl(s4, SIOCGLIFFLAGS, &lifr) != -1 || + errno != ENXIO) + continue; + + if (ioctl(s6, SIOCGLIFFLAGS, &lifr) != -1 || + errno != ENXIO) + continue; + + lifr.lifr_ppa = ppa; + lifr.lifr_flags = flags; + retval = ioctl(ip_fd, SIOCSLIFNAME, &lifr); + if (retval != -1 || errno != EEXIST) + break; + } + } else { + ifspec_t ifsp; + + /* + * The interface name could have come from the command-line; + * check it. + */ + if (!ifparse_ifspec(ifname, &ifsp) || ifsp.ifsp_lunvalid) + Perror2_exit("invalid IP interface name", ifname); + + /* + * Before we call SIOCSLIFNAME, ensure that the IPMP group + * interface for this address family exists. Otherwise, the + * kernel will kick the interface out of the group when we do + * the SIOCSLIFNAME. + * + * Example: suppose bge0 is plumbed for IPv4 and in group "a". + * If we're now plumbing bge0 for IPv6, but the IPMP group + * interface for "a" is not plumbed for IPv6, the SIOCSLIFNAME + * will kick bge0 out of group "a", which is undesired. 
+ */ + if (create_ipmp_peer(af, ifname) == -1) { + (void) fprintf(stderr, "ifconfig: warning: cannot " + "create %s IPMP group; %s will be removed from " + "group\n", af == AF_INET ? "IPv4" : "IPv6", ifname); + } - /* set the interface name */ - if (ioctl(ip_fd, SIOCSLIFNAME, (char *)&lifr) == -1) { + lifr.lifr_ppa = ifsp.ifsp_ppa; + lifr.lifr_flags = flags; + (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); + retval = ioctl(ip_fd, SIOCSLIFNAME, &lifr); + } + + if (retval == -1) { if (errno != EEXIST) Perror0_exit("SIOCSLIFNAME for ip"); /* @@ -3847,15 +3956,15 @@ plumb_one_device(int af) * called for EEXIST. */ Perror0("SIOCSLIFNAME for ip"); - return; + return (-1); } /* Get the full set of existing flags for this stream */ if (ioctl(ip_fd, SIOCGLIFFLAGS, (char *)&lifr) == -1) - Perror0_exit("plumb_one_device: SIOCFLIFFLAGS"); + Perror0_exit("ifplumb: SIOCGLIFFLAGS"); if (debug) { - (void) printf("ifconfig: plumb_one_device: %s got flags:\n", + (void) printf("ifconfig: ifplumb: %s got flags:\n", lifr.lifr_name); print_flags(lifr.lifr_flags); (void) putchar('\n'); @@ -3890,7 +3999,7 @@ plumb_one_device(int af) if ((ip_muxid = ioctl(mux_fd, I_PLINK, ip_fd)) == -1) Perror0_exit("I_PLINK for ip"); (void) close(mux_fd); - return; + return (lifr.lifr_ppa); } /* @@ -3901,15 +4010,11 @@ plumb_one_device(int af) * only on the interface stream, not on the ARP stream. */ if (debug) - (void) printf("ifconfig: plumb_one_device: ifname: %s\n", name); + (void) printf("ifconfig: ifplumb: interface %s", ifname); - /* - * We use DLPI_NOATTACH because the arp module will do the attach - * itself for DLPI style-2 devices. 
- */ - retval = dlpi_open(name, &dh_arp, DLPI_NOATTACH); + retval = dlpi_open(linkname, &dh_arp, dlpi_flags); if (retval != DLPI_SUCCESS) - Perrdlpi_exit("cannot open link", name, retval); + Perrdlpi_exit("cannot open link", linkname, retval); arp_fd = dlpi_fd(dh_arp); if (ioctl(arp_fd, I_PUSH, ARP_MOD_NAME) == -1) @@ -3919,16 +4024,13 @@ plumb_one_device(int af) * Tell ARP the name and unit number for this interface. * Note that arp has no support for transparent ioctls. */ - if (strioctl(arp_fd, SIOCSLIFNAME, (char *)&lifr, - sizeof (lifr)) == -1) { + if (strioctl(arp_fd, SIOCSLIFNAME, &lifr, sizeof (lifr)) == -1) { if (errno != EEXIST) Perror0_exit("SIOCSLIFNAME for arp"); Perror0("SIOCSLIFNAME for arp"); - dlpi_close(dh_arp); - dlpi_close(dh_ip); - (void) close(mux_fd); - return; + goto out; } + /* * PLINK the IP and ARP streams so that ifconfig can exit * without tearing down the stream. @@ -3942,12 +4044,13 @@ plumb_one_device(int af) if (debug) (void) printf("arp muxid = %d\n", arp_muxid); +out: dlpi_close(dh_ip); dlpi_close(dh_arp); (void) close(mux_fd); + return (lifr.lifr_ppa); } - /* * If this is a physical interface then remove it. * If it is a logical interface name use SIOCLIFREMOVEIF to @@ -3965,6 +4068,7 @@ inetunplumb(char *arg, int64_t param) uint64_t flags; boolean_t changed_arp_muxid = _B_FALSE; int save_errno; + boolean_t v6 = (afp->af_af == AF_INET6); strptr = strchr(name, ':'); if (strptr != NULL || strcmp(name, LOOPBACK_IF) == 0) { @@ -3986,7 +4090,7 @@ inetunplumb(char *arg, int64_t param) * We used /dev/udp or udp6 to set up the mux. So we have to use * the same now for PUNLINK also. 
*/ - if (afp->af_af == AF_INET6) + if (v6) udp_dev_name = UDP6_DEV_NAME; else udp_dev_name = UDP_DEV_NAME; @@ -4002,6 +4106,50 @@ inetunplumb(char *arg, int64_t param) Perror0_exit("unplumb: SIOCGLIFFLAGS"); } flags = lifr.lifr_flags; + + if (flags & IFF_IPMP) { + lifgroupinfo_t lifgr; + ifaddrlistx_t *ifaddrs, *ifaddrp; + + /* + * The kernel will fail the I_PUNLINK if the group still has + * members, but check now to provide a better error message. + */ + if (ioctl(s, SIOCGLIFGROUPNAME, &lifr) == -1) + Perror0_exit("unplumb: SIOCGLIFGROUPNAME"); + + (void) strlcpy(lifgr.gi_grname, lifr.lifr_groupname, + LIFGRNAMSIZ); + if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1) + Perror0_exit("unplumb: SIOCGLIFGROUPINFO"); + + if ((v6 && lifgr.gi_nv6 != 0) || (!v6 && lifgr.gi_nv4 != 0)) { + (void) fprintf(stderr, "ifconfig: %s: cannot unplumb:" + " IPMP group is not empty\n", name); + exit(1); + } + + /* + * The kernel will fail the I_PUNLINK if the IPMP interface + * has administratively up addresses; bring 'em down. + */ + if (ifaddrlistx(name, IFF_UP|IFF_DUPLICATE, 0, &ifaddrs) == -1) + Perror2_exit(name, "cannot get address list"); + + ifaddrp = ifaddrs; + for (; ifaddrp != NULL; ifaddrp = ifaddrp->ia_next) { + if (((ifaddrp->ia_flags & IFF_IPV6) && !v6) || + (!(ifaddrp->ia_flags & IFF_IPV6) && v6)) + continue; + + if (!ifaddr_down(ifaddrp)) { + Perror2_exit(ifaddrp->ia_name, + "cannot bring down"); + } + } + ifaddrlistx_free(ifaddrs); + } + if (ioctl(muxid_fd, SIOCGLIFMUXID, (caddr_t)&lifr) < 0) { Perror0_exit("unplumb: SIOCGLIFMUXID"); } @@ -4098,12 +4246,6 @@ inetplumb(char *arg, int64_t param) Perror2_exit("plumb: SIOCLIFADDIF", name); } } - /* - * IP can create the new logical interface on a different - * physical interface in the same IPMP group. Take the new - * interface into account for further operations. 
- */ - (void) strncpy(name, lifr.lifr_name, sizeof (name)); return (0); } @@ -4131,10 +4273,229 @@ inetplumb(char *arg, int64_t param) if (debug) (void) printf("inetplumb: %s af %d\n", name, afp->af_af); - plumb_one_device(afp->af_af); + (void) ifplumb(name, name, _B_FALSE, afp->af_af); + return (0); +} + +/* ARGSUSED */ +static int +inetipmp(char *arg, int64_t param) +{ + int retval; + + /* + * Treat e.g. "ifconfig ipmp0:2 ipmp" as "ifconfig ipmp0:2 plumb". + * Otherwise, try to create the requested IPMP interface. + */ + if (strchr(name, ':') != NULL) + retval = inetplumb(arg, param); + else + retval = create_ipmp(name, afp->af_af, name, _B_FALSE); + + /* + * We'd return -1, but foreachinterface() doesn't propagate the error + * into the exit status, so we're forced to explicitly exit(). + */ + if (retval == -1) + exit(1); return (0); } +/* + * Create an IPMP group `grname' with address family `af'. If `ifname' is + * non-NULL, it specifies the interface name to use. Otherwise, use the name + * ipmpN, where N corresponds to the lowest available integer. If `implicit' + * is set, then the group is being created as a side-effect of placing an + * underlying interface in a group. Also start in.mpathd if necessary. + */ +static int +create_ipmp(const char *grname, int af, const char *ifname, boolean_t implicit) +{ + int ppa; + static int ipmp_daemon_started; + + if (debug) { + (void) printf("create_ipmp: ifname %s grname %s af %d\n", + ifname != NULL ? 
ifname : "NULL", grname, af); + } + + if (ifname != NULL) + ppa = ifplumb(IPMPSTUB, ifname, _B_FALSE, af); + else + ppa = ifplumb(IPMPSTUB, "ipmp", _B_TRUE, af); + + if (ppa == -1) { + Perror2(grname, "cannot create IPMP interface"); + return (-1); + } + + if (ifname != NULL) + (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); + else + (void) snprintf(lifr.lifr_name, LIFNAMSIZ, "ipmp%d", ppa); + + /* + * To preserve backward-compatibility, always bring up the link-local + * address for implicitly-created IPv6 IPMP interfaces. + */ + if (implicit && af == AF_INET6) { + if (ioctl(s6, SIOCGLIFFLAGS, &lifr) == 0) { + lifr.lifr_flags |= IFF_UP; + (void) ioctl(s6, SIOCSLIFFLAGS, &lifr); + } + } + + /* + * If the caller requested a different group name, issue a + * SIOCSLIFGROUPNAME on the new IPMP interface. + */ + if (strcmp(lifr.lifr_name, grname) != 0) { + (void) strlcpy(lifr.lifr_groupname, grname, LIFGRNAMSIZ); + if (ioctl(s, SIOCSLIFGROUPNAME, &lifr) == -1) { + Perror0("SIOCSLIFGROUPNAME"); + return (-1); + } + } + + /* + * If we haven't done so yet, ensure in.mpathd is started. + */ + if (ipmp_daemon_started++ == 0) + start_ipmp_daemon(); + + return (0); +} + +/* + * Check if `ifname' is plumbed and in an IPMP group on its "other" address + * family. If so, create a matching IPMP group for address family `af'. + */ +static int +create_ipmp_peer(int af, const char *ifname) +{ + int fd; + lifgroupinfo_t lifgr; + + assert(af == AF_INET || af == AF_INET6); + + /* + * Get the socket for the "other" address family. + */ + fd = (af == AF_INET) ? s6 : s4; + + (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPNAME, &lifr) != 0) + return (0); + + (void) strlcpy(lifgr.gi_grname, lifr.lifr_groupname, LIFGRNAMSIZ); + if (ioctl(fd, SIOCGLIFGROUPINFO, &lifgr) != 0) + return (0); + + /* + * If `ifname' *is* the IPMP group interface, or if the relevant + * address family is already configured, then there's nothing to do. 
+ */ + if (strcmp(lifgr.gi_grifname, ifname) == 0 || + (af == AF_INET && lifgr.gi_v4) || (af == AF_INET6 && lifgr.gi_v6)) + return (0); + + return (create_ipmp(lifgr.gi_grname, af, lifgr.gi_grifname, _B_TRUE)); +} + +/* + * Start in.mpathd if it's not already running. + */ +static void +start_ipmp_daemon(void) +{ + int retval; + ipmp_handle_t ipmp_handle; + + /* + * Ping in.mpathd to see if it's running already. + */ + if ((retval = ipmp_open(&ipmp_handle)) != IPMP_SUCCESS) { + (void) fprintf(stderr, "ifconfig: cannot create IPMP handle: " + "%s\n", ipmp_errmsg(retval)); + return; + } + + retval = ipmp_ping_daemon(ipmp_handle); + ipmp_close(ipmp_handle); + + switch (retval) { + case IPMP_ENOMPATHD: + break; + case IPMP_SUCCESS: + return; + default: + (void) fprintf(stderr, "ifconfig: cannot ping in.mpathd: %s\n", + ipmp_errmsg(retval)); + break; + } + + /* + * Start in.mpathd. Note that in.mpathd will handle multiple + * incarnations (ipmp_ping_daemon() is just an optimization) so we + * don't need to worry about racing with another ifconfig process. + */ + switch (fork()) { + case -1: + Perror0_exit("start_ipmp_daemon: fork"); + /* NOTREACHED */ + case 0: + (void) execl(MPATHD_PATH, MPATHD_PATH, NULL); + _exit(1); + /* NOTREACHED */ + default: + break; + } +} + +/* + * Bring the address named by `ifaddrp' up or down. Doesn't trust any mutable + * values in ia_flags since they may be stale. + */ +static boolean_t +ifaddr_op(ifaddrlistx_t *ifaddrp, boolean_t up) +{ + struct lifreq lifrl; /* Local lifreq struct */ + int fd = (ifaddrp->ia_flags & IFF_IPV4) ? s4 : s6; + + (void) memset(&lifrl, 0, sizeof (lifrl)); + (void) strlcpy(lifrl.lifr_name, ifaddrp->ia_name, LIFNAMSIZ); + if (ioctl(fd, SIOCGLIFFLAGS, &lifrl) == -1) + return (_B_FALSE); + + if (up) { + lifrl.lifr_flags |= IFF_UP; + } else { + /* + * If we've been asked to bring down an IFF_DUPLICATE address, + * then get the address and set it. 
This will cause IP to + * clear IFF_DUPLICATE and stop the automatic recovery timer. + */ + if (lifrl.lifr_flags & IFF_DUPLICATE) { + return (ioctl(fd, SIOCGLIFADDR, &lifrl) != -1 && + ioctl(fd, SIOCSLIFADDR, &lifrl) != -1); + } + lifrl.lifr_flags &= ~IFF_UP; + } + return (ioctl(fd, SIOCSLIFFLAGS, &lifrl) == 0); +} + +static boolean_t +ifaddr_up(ifaddrlistx_t *ifaddrp) +{ + return (ifaddr_op(ifaddrp, _B_TRUE)); +} + +static boolean_t +ifaddr_down(ifaddrlistx_t *ifaddrp) +{ + return (ifaddr_op(ifaddrp, _B_FALSE)); +} + void Perror0(const char *cmd) { @@ -4404,14 +4765,14 @@ print_flags(uint64_t flags) } static void -print_config_flags(uint64_t flags) +print_config_flags(int af, uint64_t flags) { - int cnt, i; + if_config_cmd_t *cmdp; - cnt = sizeof (if_config_cmd_tbl) / sizeof (if_config_cmd_t); - for (i = 0; i < cnt; i++) { - if (flags & if_config_cmd_tbl[i].iff_flag) { - (void) printf("%s ", if_config_cmd_tbl[i].iff_name); + for (cmdp = if_config_cmd_tbl; cmdp->iff_flag != 0; cmdp++) { + if ((flags & cmdp->iff_flag) && + (cmdp->iff_af == AF_UNSPEC || cmdp->iff_af == af)) { + (void) printf("%s ", cmdp->iff_name); } } } @@ -4454,7 +4815,18 @@ in_getmask(struct sockaddr_in *saddr, boolean_t addr_set) } static int -strioctl(int s, int cmd, char *buf, int buflen) +lifnum(const char *ifname) +{ + const char *cp; + + if ((cp = strchr(ifname, ':')) == NULL) + return (0); + else + return (atoi(cp + 1)); +} + +static int +strioctl(int s, int cmd, void *buf, int buflen) { struct strioctl ioc; @@ -4681,6 +5053,7 @@ usage(void) "\t[ modlist ]\n" "\t[ modinsert <module_name@position> ]\n" "\t[ modremove <module_name@position> ]\n" + "\t[ ipmp ]\n" "\t[ group <groupname>] | [ group \"\"]\n" "\t[ deprecated | -deprecated ]\n" "\t[ standby | -standby ]\n" diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h index 0ac600001f..f11f4d0a94 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h +++ 
b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.h @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -11,8 +11,6 @@ #ifndef _IFCONFIG_H #define _IFCONFIG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -39,7 +37,6 @@ extern void Perrdlpi_exit(const char *, const char *, int); extern int doifrevarp(const char *, struct sockaddr_in *); -extern int dlpi_set_address(const char *, uchar_t *, uint_t); extern void dlpi_print_address(const char *); #ifdef __cplusplus diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c index 725c8b24c3..aba4794942 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "defs.h" #include "ifconfig.h" #include <sys/types.h> @@ -88,6 +86,7 @@ doifrevarp(const char *linkname, struct sockaddr_in *laddr) /* don't try to revarp if we know it won't work */ if ((lifr.lifr_flags & IFF_LOOPBACK) || (lifr.lifr_flags & IFF_NOARP) || + (lifr.lifr_flags & IFF_IPMP) || (lifr.lifr_flags & IFF_POINTOPOINT)) { (void) close(s); return (0); @@ -326,28 +325,6 @@ rarp_recv(dlpi_handle_t dh, struct arphdr *ans, size_t msglen, return (DLPI_ETIMEDOUT); } -int -dlpi_set_address(const char *linkname, uchar_t *physaddr, uint_t physaddrlen) -{ - int retval; - dlpi_handle_t dh; - - if ((retval = dlpi_open(linkname, &dh, 0)) != DLPI_SUCCESS) { - Perrdlpi("dlpi_open failed", linkname, retval); - return (-1); - } - - if ((retval = dlpi_set_physaddr(dh, DL_CURR_PHYS_ADDR, physaddr, - physaddrlen)) != DLPI_SUCCESS) { - Perrdlpi("dlpi_set_physaddr failed", linkname, retval); - dlpi_close(dh); - return (-1); - } - - dlpi_close(dh); - return (0); -} - void dlpi_print_address(const char *linkname) { diff --git a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h index 900b5841ed..5cca3ecb2e 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h +++ b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/defs.h @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Copyright (c) 1983, 1988, 1993 @@ -414,16 +414,9 @@ struct interface { (IS_REMOTE|IS_PASSIVE)) /* - * Is an IP interface up? Because of the way IPMP uses deprecated - * interfaces, we need to check more than the IFF_UP and IFF_RUNNING - * interface flags here. Basically, we do not want to use IFF_DEPRECATED - * interfaces unless they are also IFF_STANDBY and not IFF_INACTIVE. + * Is an IP interface up? 
*/ -#define IFF_GOOD (IFF_UP|IFF_RUNNING) -#define IS_IFF_UP(f) \ - ((((f) & (IFF_GOOD|IFF_DEPRECATED)) == IFF_GOOD) || \ - (((f) & (IFF_GOOD|IFF_INACTIVE|IFF_STANDBY)) == \ - (IFF_GOOD|IFF_STANDBY))) +#define IS_IFF_UP(f) (((f) & (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING)) /* * This defines interfaces that we should not use for advertising or diff --git a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c index 79ae02e703..a3a26ac2cb 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/in.routed/trace.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Copyright (c) 1983, 1988, 1993 @@ -36,8 +36,6 @@ * $FreeBSD: src/sbin/routed/trace.c,v 1.6 2000/08/11 08:24:38 sheldonh Exp $ */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "defs.h" #include "pathnames.h" #include <signal.h> @@ -566,6 +564,7 @@ static struct bits if_bits[] = { { IFF_TEMPORARY, 0, "TEMPORARY" }, { IFF_FIXEDMTU, 0, "FIXEDMTU" }, { IFF_VIRTUAL, 0, "VIRTUAL"}, + { IFF_IPMP, 0, "IPMP"}, { 0, 0, NULL} }; @@ -898,8 +897,8 @@ trace_upslot(struct rt_entry *rt, print_rts(rts, 0, 0, rts->rts_gate != new->rts_gate, rts->rts_tag != new->rts_tag, - rts != rt->rt_spares || AGE_RT(rt->rt_state, - rts->rts_origin, rt->rt_ifp)); + rts != rt->rt_spares || + AGE_RT(rt->rt_state, rts->rts_origin, rt->rt_ifp)); (void) fprintf(ftrace, "\n %19s%-16s ", "", (new->rts_gate != rts->rts_gate ? 
@@ -1173,10 +1172,9 @@ trace_rip(const char *dir1, const char *dir2, if (NA->a_type == RIP_AUTH_PW && n == msg->rip_nets) { (void) fprintf(ftrace, "\tPassword" - " Authentication:" - " \"%s\"\n", + " Authentication: \"%s\"\n", qstring(NA->au.au_pw, - RIP_AUTH_PW_LEN)); + RIP_AUTH_PW_LEN)); continue; } @@ -1186,13 +1184,12 @@ trace_rip(const char *dir1, const char *dir2, "\tMD5 Auth" " pkt_len=%d KeyID=%u" " auth_len=%d" - " seqno=%#lx" - " rsvd=%#x,%#x\n", + " seqno=%#x" + " rsvd=%#hx,%#hx\n", ntohs(NA->au.a_md5.md5_pkt_len), NA->au.a_md5.md5_keyid, NA->au.a_md5.md5_auth_len, - (unsigned long)ntohl(NA->au.a_md5. - md5_seqno), + ntohl(NA->au.a_md5.md5_seqno), ntohs(NA->au.a_md5.rsvd[0]), ntohs(NA->au.a_md5.rsvd[1])); continue; @@ -1217,14 +1214,12 @@ trace_rip(const char *dir1, const char *dir2, inet_ntoa(tmp_mask)); } else if (msg->rip_vers == RIPv1) { (void) fprintf(ftrace, "\t%-18s ", - addrname(n->n_dst, - ntohl(n->n_mask), - n->n_mask == 0 ? 2 : 1)); + addrname(n->n_dst, ntohl(n->n_mask), + n->n_mask == 0 ? 2 : 1)); } else { (void) fprintf(ftrace, "\t%-18s ", - addrname(n->n_dst, - ntohl(n->n_mask), - n->n_mask == 0 ? 2 : 0)); + addrname(n->n_dst, ntohl(n->n_mask), + n->n_mask == 0 ? 2 : 0)); } (void) fprintf(ftrace, "metric=%-2lu ", (unsigned long)ntohl(n->n_metric)); @@ -1242,8 +1237,8 @@ trace_rip(const char *dir1, const char *dir2, break; case RIPCMD_TRACEON: - (void) fprintf(ftrace, "\tfile=\"%.*s\"\n", size-4, - msg->rip_tracefile); + (void) fprintf(ftrace, "\tfile=\"%.*s\"\n", size - 4, + msg->rip_tracefile); break; case RIPCMD_TRACEOFF: diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/Makefile new file mode 100644 index 0000000000..a256cf5f49 --- /dev/null +++ b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/Makefile @@ -0,0 +1,48 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +PROG = ipmpstat +ROOTFS_PROG = $(PROG) +ROOTUSRSBINLINKS = $(PROG:%=$(ROOTUSRSBIN)/%) + +include $(SRC)/cmd/Makefile.cmd + +C99MODE = $(C99_ENABLE) +LDLIBS += -lipmp -lsocket -lsysevent -lnvpair +XGETFLAGS += -a -x $(PROG).xcl + +.KEEP_STATE: + +all: $(PROG) + +install: all $(ROOTSBINPROG) $(ROOTUSRSBINLINKS) + +clean: + +lint: lint_PROG + +$(ROOTUSRSBINLINKS): + -$(RM) $@; $(SYMLINK) ../../sbin/$(@F) $@ + +include $(SRC)/cmd/Makefile.targ diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.c b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.c new file mode 100644 index 0000000000..4620c34a24 --- /dev/null +++ b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.c @@ -0,0 +1,1498 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <alloca.h> +#include <arpa/inet.h> +#include <assert.h> +#include <errno.h> +#include <ipmp_admin.h> +#include <ipmp_query.h> +#include <libintl.h> +#include <libnvpair.h> +#include <libsysevent.h> +#include <locale.h> +#include <netdb.h> +#include <signal.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/ipmp.h> +#include <sys/sysmacros.h> +#include <sys/termios.h> +#include <sys/types.h> + +/* + * ipmpstat -- display IPMP subsystem status. + * + * This utility makes extensive use of libipmp and IPMP sysevents to gather + * and pretty-print the status of the IPMP subsystem. All output formats + * except for -p (probe) use libipmp to create a point-in-time snapshot of the + * IPMP subsystem (unless the test-special -L flag is used), and then output + * the contents of that snapshot in a user-specified manner. Because the + * output format and requested fields aren't known until run-time, three sets + * of function pointers and two core data structures are used. Specifically: + * + * * The ipmpstat_walker_t function pointers (walk_*) iterate through + * all instances of a given IPMP object (group, interface, or address). + * At most one ipmpstat_walker_t is used per ipmpstat invocation. + * Since target information is included with the interface information, + * both -i and -t use the interface walker (walk_if()). 
+ * + * * The ipmpstat_sfunc_t function pointers (sfunc_*) obtain a given + * value for a given IPMP object. Each ipmpstat_sfunc_t is passed a + * buffer to write its result into, the buffer's size, and an + * ipmpstat_sfunc_arg_t state structure. The state structure consists + * of a pointer to the IPMP object to obtain information from + * (sa_data), and an open libipmp handle (sa_ih) which can be used to + * do additional libipmp queries, if necessary (e.g., because the + * object does not have all of the needed information). + * + * * The ipmpstat_field_t structure provides the list of supported fields + * for a given output format, along with output formatting information + * (e.g., field width), and a pointer to an ipmpstat_sfunc_t function + * that can obtain the value for a given IPMP object. For a given + * ipmpstat output format, there's a corresponding array of + * ipmpstat_field_t structures. Thus, one ipmpstat_field_t array is + * used per ipmpstat invocation. + * + * * The ipmpstat_ofmt_t provides an ordered list of the requested + * ipmpstat_field_t's (e.g., via -o) for a given ipmpstat invocation. + * It is built at runtime from the command-line arguments. This + * structure (and a given IPMP object) is used by ofmt_output() to + * output a single line of information about that IPMP object. + * + * * The ipmpstat_cbfunc_t function pointers (*_cbfunc) are called back + * by the walkers. They are used both internally to implement nested + * walks, and by the ipmpstat output logic to provide the glue between + * the IPMP object walkers and the ofmt_output() logic. Usually, a + * single line is output for each IPMP object, and thus ofmt_output() + * can be directly invoked (see info_output_cbfunc()). However, if + * multiple lines need to be output, then a more complex cbfunc is + * needed (see targinfo_output_cbfunc()). At most one cbfunc is used + * per ipmpstat invocation.
+ */ + +/* + * Data type used by the sfunc callbacks to obtain the requested information + * from the agreed-upon object. + */ +typedef struct ipmpstat_sfunc_arg { + ipmp_handle_t sa_ih; + void *sa_data; +} ipmpstat_sfunc_arg_t; + +typedef void ipmpstat_sfunc_t(ipmpstat_sfunc_arg_t *, char *, uint_t); + +/* + * Data type that describes how to output a field; used by ofmt_output*(). + */ +typedef struct ipmpstat_field { + const char *f_name; /* field name */ + uint_t f_width; /* output width */ + ipmpstat_sfunc_t *f_sfunc; /* value->string function */ +} ipmpstat_field_t; + +/* + * Data type that specifies the output field order; used by ofmt_output*() + */ +typedef struct ipmpstat_ofmt { + const ipmpstat_field_t *o_field; /* current field info */ + struct ipmpstat_ofmt *o_next; /* next field */ +} ipmpstat_ofmt_t; + +/* + * Function pointers used to iterate through IPMP objects. + */ +typedef void ipmpstat_cbfunc_t(ipmp_handle_t, void *, void *); +typedef void ipmpstat_walker_t(ipmp_handle_t, ipmpstat_cbfunc_t *, void *); + +/* + * Data type used to implement nested walks. + */ +typedef struct ipmpstat_walkdata { + ipmpstat_cbfunc_t *iw_func; /* caller-specified callback */ + void *iw_funcarg; /* caller-specified arg */ +} ipmpstat_walkdata_t; + +/* + * Data type used by enum2str() to map an enumerated value to a string. + */ +typedef struct ipmpstat_enum { + const char *e_name; /* string */ + int e_val; /* value */ +} ipmpstat_enum_t; + +/* + * Data type used to pass state between probe_output() and probe_event(). + */ +typedef struct ipmpstat_probe_state { + ipmp_handle_t ps_ih; /* open IPMP handle */ + ipmpstat_ofmt_t *ps_ofmt; /* requested ofmt string */ +} ipmpstat_probe_state_t; + +/* + * Options that modify the output mode; more than one may be lit. + */ +typedef enum { + IPMPSTAT_OPT_NUMERIC = 0x1, + IPMPSTAT_OPT_PARSABLE = 0x2 +} ipmpstat_opt_t; + +/* + * Indices for the FLAGS field of the `-i' output format. 
+ */ +enum { + IPMPSTAT_IFLAG_INDEX, IPMPSTAT_SFLAG_INDEX, IPMPSTAT_M4FLAG_INDEX, + IPMPSTAT_BFLAG_INDEX, IPMPSTAT_M6FLAG_INDEX, IPMPSTAT_DFLAG_INDEX, + IPMPSTAT_HFLAG_INDEX, IPMPSTAT_NUM_FLAGS +}; + +#define IPMPSTAT_NCOL 80 +#define NS2FLOATMS(ns) ((float)(ns) / (NANOSEC / MILLISEC)) +#define MS2FLOATSEC(ms) ((float)(ms) / 1000) + +static const char *progname; +static hrtime_t probe_output_start; +static struct winsize winsize; +static ipmpstat_opt_t opt; +static ipmpstat_enum_t addr_state[], group_state[], if_state[], if_link[]; +static ipmpstat_enum_t if_probe[], targ_mode[]; +static ipmpstat_field_t addr_fields[], group_fields[], if_fields[]; +static ipmpstat_field_t probe_fields[], targ_fields[]; +static ipmpstat_cbfunc_t walk_addr_cbfunc, walk_if_cbfunc; +static ipmpstat_cbfunc_t info_output_cbfunc, targinfo_output_cbfunc; +static ipmpstat_walker_t walk_addr, walk_if, walk_group; + +static int probe_event(sysevent_t *, void *); +static void probe_output(ipmp_handle_t, ipmpstat_ofmt_t *); +static ipmpstat_field_t *field_find(ipmpstat_field_t *, const char *); +static ipmpstat_ofmt_t *ofmt_create(const char *, ipmpstat_field_t []); +static void ofmt_output(const ipmpstat_ofmt_t *, ipmp_handle_t, void *); +static void ofmt_destroy(ipmpstat_ofmt_t *); +static void enum2str(const ipmpstat_enum_t *, int, char *, uint_t); +static void sockaddr2str(const struct sockaddr_storage *, char *, uint_t); +static void sighandler(int); +static void usage(void); +static void die(const char *, ...); +static void die_ipmperr(int, const char *, ...); +static void warn(const char *, ...); +static void warn_ipmperr(int, const char *, ...); + +int +main(int argc, char **argv) +{ + int c; + int err; + const char *ofields = NULL; + ipmp_handle_t ih; + ipmp_qcontext_t qcontext = IPMP_QCONTEXT_SNAP; + ipmpstat_ofmt_t *ofmt; + ipmpstat_field_t *fields = NULL; + ipmpstat_cbfunc_t *cbfunc; + ipmpstat_walker_t *walker; + + if ((progname = strrchr(argv[0], '/')) == NULL) + progname = 
argv[0]; + else + progname++; + + (void) setlocale(LC_ALL, ""); + (void) textdomain(TEXT_DOMAIN); + + while ((c = getopt(argc, argv, "nLPo:agipt")) != EOF) { + if (fields != NULL && strchr("agipt", c) != NULL) + die("only one output format may be specified\n"); + + switch (c) { + case 'n': + opt |= IPMPSTAT_OPT_NUMERIC; + break; + case 'L': + /* Undocumented option: for testing use ONLY */ + qcontext = IPMP_QCONTEXT_LIVE; + break; + case 'P': + opt |= IPMPSTAT_OPT_PARSABLE; + break; + case 'o': + ofields = optarg; + break; + case 'a': + walker = walk_addr; + cbfunc = info_output_cbfunc; + fields = addr_fields; + break; + case 'g': + walker = walk_group; + cbfunc = info_output_cbfunc; + fields = group_fields; + break; + case 'i': + walker = walk_if; + cbfunc = info_output_cbfunc; + fields = if_fields; + break; + case 'p': + fields = probe_fields; + break; + case 't': + walker = walk_if; + cbfunc = targinfo_output_cbfunc; + fields = targ_fields; + break; + default: + usage(); + break; + } + } + + if (argc > optind || fields == NULL) + usage(); + + if (opt & IPMPSTAT_OPT_PARSABLE) { + if (ofields == NULL) { + die("output field list (-o) required in parsable " + "output mode\n"); + } else if (strcasecmp(ofields, "all") == 0) { + die("\"all\" not allowed in parsable output mode\n"); + } + } + + /* + * Obtain the window size and monitor changes to the size. This data + * is used to redisplay the output headers when necessary. + */ + (void) sigset(SIGWINCH, sighandler); + sighandler(SIGWINCH); + + if ((err = ipmp_open(&ih)) != IPMP_SUCCESS) + die_ipmperr(err, "cannot create IPMP handle"); + + if (ipmp_ping_daemon(ih) != IPMP_SUCCESS) + die("cannot contact in.mpathd(1M) -- is IPMP in use?\n"); + + /* + * Create the ofmt linked list that will eventually be passed to + * to ofmt_output() to output the fields. + */ + ofmt = ofmt_create(ofields, fields); + + /* + * If we've been asked to display probes, then call the probe output + * function. 
Otherwise, snapshot IPMP state (or use live state) and + * invoke the specified walker with the specified callback function. + */ + if (fields == probe_fields) { + probe_output(ih, ofmt); + } else { + if ((err = ipmp_setqcontext(ih, qcontext)) != IPMP_SUCCESS) { + if (qcontext == IPMP_QCONTEXT_SNAP) + die_ipmperr(err, "cannot snapshot IPMP state"); + else + die_ipmperr(err, "cannot use live IPMP state"); + } + (*walker)(ih, cbfunc, ofmt); + } + + ofmt_destroy(ofmt); + ipmp_close(ih); + + return (EXIT_SUCCESS); +} + +/* + * Walks all IPMP groups on the system and invokes `cbfunc' on each, passing + * it `ih', the ipmp_groupinfo_t pointer, and `arg'. + */ +static void +walk_group(ipmp_handle_t ih, ipmpstat_cbfunc_t *cbfunc, void *arg) +{ + int err; + uint_t i; + ipmp_groupinfo_t *grinfop; + ipmp_grouplist_t *grlistp; + + if ((err = ipmp_getgrouplist(ih, &grlistp)) != IPMP_SUCCESS) + die_ipmperr(err, "cannot get IPMP group list"); + + for (i = 0; i < grlistp->gl_ngroup; i++) { + err = ipmp_getgroupinfo(ih, grlistp->gl_groups[i], &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for group `%s'", + grlistp->gl_groups[i]); + continue; + } + (*cbfunc)(ih, grinfop, arg); + ipmp_freegroupinfo(grinfop); + } + + ipmp_freegrouplist(grlistp); +} + +/* + * Walks all IPMP interfaces on the system and invokes `cbfunc' on each, + * passing it `ih', the ipmp_ifinfo_t pointer, and `arg'. + */ +static void +walk_if(ipmp_handle_t ih, ipmpstat_cbfunc_t *cbfunc, void *arg) +{ + ipmpstat_walkdata_t iw = { cbfunc, arg }; + + walk_group(ih, walk_if_cbfunc, &iw); +} + +/* + * Walks all IPMP data addresses on the system and invokes `cbfunc' on each. + * passing it `ih', the ipmp_addrinfo_t pointer, and `arg'. + */ +static void +walk_addr(ipmp_handle_t ih, ipmpstat_cbfunc_t *cbfunc, void *arg) +{ + ipmpstat_walkdata_t iw = { cbfunc, arg }; + + walk_group(ih, walk_addr_cbfunc, &iw); +} + +/* + * Nested walker callback function for walk_if(). 
+ */ +static void +walk_if_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + int err; + uint_t i; + ipmp_groupinfo_t *grinfop = infop; + ipmp_ifinfo_t *ifinfop; + ipmp_iflist_t *iflistp = grinfop->gr_iflistp; + ipmpstat_walkdata_t *iwp = arg; + + for (i = 0; i < iflistp->il_nif; i++) { + err = ipmp_getifinfo(ih, iflistp->il_ifs[i], &ifinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for interface `%s'", + iflistp->il_ifs[i]); + continue; + } + (*iwp->iw_func)(ih, ifinfop, iwp->iw_funcarg); + ipmp_freeifinfo(ifinfop); + } +} + +/* + * Nested walker callback function for walk_addr(). + */ +static void +walk_addr_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + int err; + uint_t i; + ipmp_groupinfo_t *grinfop = infop; + ipmp_addrinfo_t *adinfop; + ipmp_addrlist_t *adlistp = grinfop->gr_adlistp; + ipmpstat_walkdata_t *iwp = arg; + char addr[INET6_ADDRSTRLEN]; + struct sockaddr_storage *addrp; + + for (i = 0; i < adlistp->al_naddr; i++) { + addrp = &adlistp->al_addrs[i]; + err = ipmp_getaddrinfo(ih, grinfop->gr_name, addrp, &adinfop); + if (err != IPMP_SUCCESS) { + sockaddr2str(addrp, addr, sizeof (addr)); + warn_ipmperr(err, "cannot get info for `%s'", addr); + continue; + } + (*iwp->iw_func)(ih, adinfop, iwp->iw_funcarg); + ipmp_freeaddrinfo(adinfop); + } +} + +static void +sfunc_nvwarn(const char *nvname, char *buf, uint_t bufsize) +{ + warn("cannot retrieve %s\n", nvname); + (void) strlcpy(buf, "?", bufsize); +} + +static void +sfunc_addr_address(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_addrinfo_t *adinfop = arg->sa_data; + + sockaddr2str(&adinfop->ad_addr, buf, bufsize); +} + +static void +sfunc_addr_group(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + ipmp_addrinfo_t *adinfop = arg->sa_data; + ipmp_groupinfo_t *grinfop; + + err = ipmp_getgroupinfo(arg->sa_ih, adinfop->ad_group, &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for group `%s'", + 
adinfop->ad_group); + (void) strlcpy(buf, "?", bufsize); + return; + } + (void) strlcpy(buf, grinfop->gr_ifname, bufsize); + ipmp_freegroupinfo(grinfop); +} + +static void +sfunc_addr_state(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_addrinfo_t *adinfop = arg->sa_data; + + enum2str(addr_state, adinfop->ad_state, buf, bufsize); +} + +static void +sfunc_addr_inbound(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_addrinfo_t *adinfop = arg->sa_data; + + (void) strlcpy(buf, adinfop->ad_binding, bufsize); +} + +static void +sfunc_addr_outbound(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + uint_t i, nactive = 0; + ipmp_ifinfo_t *ifinfop; + ipmp_iflist_t *iflistp; + ipmp_addrinfo_t *adinfop = arg->sa_data; + ipmp_groupinfo_t *grinfop; + + if (adinfop->ad_state == IPMP_ADDR_DOWN) + return; + + /* + * If there's no inbound interface for this address, there can't + * be any outbound traffic. + */ + if (adinfop->ad_binding[0] == '\0') + return; + + /* + * The address can use any active interface in the group, so + * obtain all of those. 
+ */ + err = ipmp_getgroupinfo(arg->sa_ih, adinfop->ad_group, &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for group `%s'", + adinfop->ad_group); + (void) strlcpy(buf, "?", bufsize); + return; + } + + iflistp = grinfop->gr_iflistp; + for (i = 0; i < iflistp->il_nif; i++) { + err = ipmp_getifinfo(arg->sa_ih, iflistp->il_ifs[i], &ifinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for interface `%s'", + iflistp->il_ifs[i]); + continue; + } + + if (ifinfop->if_flags & IPMP_IFFLAG_ACTIVE) { + if (nactive++ != 0) + (void) strlcat(buf, " ", bufsize); + (void) strlcat(buf, ifinfop->if_name, bufsize); + } + ipmp_freeifinfo(ifinfop); + } + ipmp_freegroupinfo(grinfop); +} + +static void +sfunc_group_name(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_groupinfo_t *grinfop = arg->sa_data; + + (void) strlcpy(buf, grinfop->gr_name, bufsize); +} + +static void +sfunc_group_ifname(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_groupinfo_t *grinfop = arg->sa_data; + + (void) strlcpy(buf, grinfop->gr_ifname, bufsize); +} + +static void +sfunc_group_state(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_groupinfo_t *grinfop = arg->sa_data; + + enum2str(group_state, grinfop->gr_state, buf, bufsize); +} + +static void +sfunc_group_fdt(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_groupinfo_t *grinfop = arg->sa_data; + + if (grinfop->gr_fdt == 0) + return; + + (void) snprintf(buf, bufsize, "%.2fs", MS2FLOATSEC(grinfop->gr_fdt)); +} + +static void +sfunc_group_interfaces(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + uint_t i; + char *active, *inactive, *unusable; + uint_t nactive = 0, ninactive = 0, nunusable = 0; + ipmp_groupinfo_t *grinfop = arg->sa_data; + ipmp_iflist_t *iflistp = grinfop->gr_iflistp; + ipmp_ifinfo_t *ifinfop; + + active = alloca(bufsize); + active[0] = '\0'; + inactive = alloca(bufsize); + inactive[0] = '\0'; + unusable = 
alloca(bufsize); + unusable[0] = '\0'; + + for (i = 0; i < iflistp->il_nif; i++) { + err = ipmp_getifinfo(arg->sa_ih, iflistp->il_ifs[i], &ifinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get info for interface `%s'", + iflistp->il_ifs[i]); + continue; + } + + if (ifinfop->if_flags & IPMP_IFFLAG_ACTIVE) { + if (nactive++ != 0) + (void) strlcat(active, " ", bufsize); + (void) strlcat(active, ifinfop->if_name, bufsize); + } else if (ifinfop->if_flags & IPMP_IFFLAG_INACTIVE) { + if (ninactive++ != 0) + (void) strlcat(inactive, " ", bufsize); + (void) strlcat(inactive, ifinfop->if_name, bufsize); + } else { + if (nunusable++ != 0) + (void) strlcat(unusable, " ", bufsize); + (void) strlcat(unusable, ifinfop->if_name, bufsize); + } + + ipmp_freeifinfo(ifinfop); + } + + (void) strlcpy(buf, active, bufsize); + + if (ninactive > 0) { + if (nactive != 0) + (void) strlcat(buf, " ", bufsize); + + (void) strlcat(buf, "(", bufsize); + (void) strlcat(buf, inactive, bufsize); + (void) strlcat(buf, ")", bufsize); + } + + if (nunusable > 0) { + if (nactive + ninactive != 0) + (void) strlcat(buf, " ", bufsize); + + (void) strlcat(buf, "[", bufsize); + (void) strlcat(buf, unusable, bufsize); + (void) strlcat(buf, "]", bufsize); + } +} + +static void +sfunc_if_name(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + (void) strlcpy(buf, ifinfop->if_name, bufsize); +} + +static void +sfunc_if_active(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + if (ifinfop->if_flags & IPMP_IFFLAG_ACTIVE) + (void) strlcpy(buf, "yes", bufsize); + else + (void) strlcpy(buf, "no", bufsize); +} + +static void +sfunc_if_group(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + ipmp_ifinfo_t *ifinfop = arg->sa_data; + ipmp_groupinfo_t *grinfop; + + err = ipmp_getgroupinfo(arg->sa_ih, ifinfop->if_group, &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot 
get info for group `%s'", + ifinfop->if_group); + (void) strlcpy(buf, "?", bufsize); + return; + } + + (void) strlcpy(buf, grinfop->gr_ifname, bufsize); + ipmp_freegroupinfo(grinfop); +} + +static void +sfunc_if_flags(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int err; + ipmp_ifinfo_t *ifinfop = arg->sa_data; + ipmp_groupinfo_t *grinfop; + + assert(bufsize > IPMPSTAT_NUM_FLAGS); + + (void) memset(buf, '-', IPMPSTAT_NUM_FLAGS); + buf[IPMPSTAT_NUM_FLAGS] = '\0'; + + if (ifinfop->if_type == IPMP_IF_STANDBY) + buf[IPMPSTAT_SFLAG_INDEX] = 's'; + + if (ifinfop->if_flags & IPMP_IFFLAG_INACTIVE) + buf[IPMPSTAT_IFLAG_INDEX] = 'i'; + + if (ifinfop->if_flags & IPMP_IFFLAG_DOWN) + buf[IPMPSTAT_DFLAG_INDEX] = 'd'; + + if (ifinfop->if_flags & IPMP_IFFLAG_HWADDRDUP) + buf[IPMPSTAT_HFLAG_INDEX] = 'h'; + + err = ipmp_getgroupinfo(arg->sa_ih, ifinfop->if_group, &grinfop); + if (err != IPMP_SUCCESS) { + warn_ipmperr(err, "cannot get broadcast/multicast info for " + "group `%s'", ifinfop->if_group); + return; + } + + if (strcmp(grinfop->gr_m4ifname, ifinfop->if_name) == 0) + buf[IPMPSTAT_M4FLAG_INDEX] = 'm'; + + if (strcmp(grinfop->gr_m6ifname, ifinfop->if_name) == 0) + buf[IPMPSTAT_M6FLAG_INDEX] = 'M'; + + if (strcmp(grinfop->gr_bcifname, ifinfop->if_name) == 0) + buf[IPMPSTAT_BFLAG_INDEX] = 'b'; + + ipmp_freegroupinfo(grinfop); +} + +static void +sfunc_if_link(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + enum2str(if_link, ifinfop->if_linkstate, buf, bufsize); +} + +static void +sfunc_if_probe(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + enum2str(if_probe, ifinfop->if_probestate, buf, bufsize); +} + +static void +sfunc_if_state(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_ifinfo_t *ifinfop = arg->sa_data; + + enum2str(if_state, ifinfop->if_state, buf, bufsize); +} + +static void +sfunc_probe_id(ipmpstat_sfunc_arg_t *arg, char *buf, 
uint_t bufsize) +{ + uint32_t probe_id; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_uint32(nvl, IPMP_PROBE_ID, &probe_id) != 0) { + sfunc_nvwarn("IPMP_PROBE_ID", buf, bufsize); + return; + } + + (void) snprintf(buf, bufsize, "%u", probe_id); +} + +static void +sfunc_probe_ifname(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + char *ifname; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_string(nvl, IPMP_IF_NAME, &ifname) != 0) { + sfunc_nvwarn("IPMP_IF_NAME", buf, bufsize); + return; + } + + (void) strlcpy(buf, ifname, bufsize); +} + +static void +sfunc_probe_time(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + hrtime_t start; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_START_TIME, &start) != 0) { + sfunc_nvwarn("IPMP_PROBE_START_TIME", buf, bufsize); + return; + } + + (void) snprintf(buf, bufsize, "%.2fs", + (float)(start - probe_output_start) / NANOSEC); +} + +static void +sfunc_probe_target(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + uint_t nelem; + struct sockaddr_storage *target; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_byte_array(nvl, IPMP_PROBE_TARGET, + (uchar_t **)&target, &nelem) != 0) { + sfunc_nvwarn("IPMP_PROBE_TARGET", buf, bufsize); + return; + } + + sockaddr2str(target, buf, bufsize); +} + +static void +sfunc_probe_rtt(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + hrtime_t start, ackproc; + nvlist_t *nvl = arg->sa_data; + uint32_t state; + + if (nvlist_lookup_uint32(nvl, IPMP_PROBE_STATE, &state) != 0) { + sfunc_nvwarn("IPMP_PROBE_STATE", buf, bufsize); + return; + } + + if (state != IPMP_PROBE_ACKED) + return; + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_START_TIME, &start) != 0) { + sfunc_nvwarn("IPMP_PROBE_START_TIME", buf, bufsize); + return; + } + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, &ackproc) != 0) { + sfunc_nvwarn("IPMP_PROBE_ACKPROC_TIME", buf, bufsize); + return; + } + + (void) snprintf(buf, bufsize, "%.2fms", 
NS2FLOATMS(ackproc - start)); +} + +static void +sfunc_probe_netrtt(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + hrtime_t sent, ackrecv; + nvlist_t *nvl = arg->sa_data; + uint32_t state; + + if (nvlist_lookup_uint32(nvl, IPMP_PROBE_STATE, &state) != 0) { + sfunc_nvwarn("IPMP_PROBE_STATE", buf, bufsize); + return; + } + + if (state != IPMP_PROBE_ACKED) + return; + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_SENT_TIME, &sent) != 0) { + sfunc_nvwarn("IPMP_PROBE_SENT_TIME", buf, bufsize); + return; + } + + if (nvlist_lookup_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, &ackrecv) != 0) { + sfunc_nvwarn("IPMP_PROBE_ACKRECV_TIME", buf, bufsize); + return; + } + + (void) snprintf(buf, bufsize, "%.2fms", NS2FLOATMS(ackrecv - sent)); +} + +static void +sfunc_probe_rttavg(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int64_t rttavg; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_int64(nvl, IPMP_PROBE_TARGET_RTTAVG, &rttavg) != 0) { + sfunc_nvwarn("IPMP_PROBE_TARGET_RTTAVG", buf, bufsize); + return; + } + + if (rttavg != 0) + (void) snprintf(buf, bufsize, "%.2fms", NS2FLOATMS(rttavg)); +} + +static void +sfunc_probe_rttdev(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + int64_t rttdev; + nvlist_t *nvl = arg->sa_data; + + if (nvlist_lookup_int64(nvl, IPMP_PROBE_TARGET_RTTDEV, &rttdev) != 0) { + sfunc_nvwarn("IPMP_PROBE_TARGET_RTTDEV", buf, bufsize); + return; + } + + if (rttdev != 0) + (void) snprintf(buf, bufsize, "%.2fms", NS2FLOATMS(rttdev)); +} + +/* ARGSUSED */ +static void +probe_enabled_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + uint_t *nenabledp = arg; + ipmp_ifinfo_t *ifinfop = infop; + + if (ifinfop->if_probestate != IPMP_PROBE_DISABLED) + (*nenabledp)++; +} + +static void +probe_output(ipmp_handle_t ih, ipmpstat_ofmt_t *ofmt) +{ + char sub[MAX_SUBID_LEN]; + evchan_t *evch; + ipmpstat_probe_state_t ps = { ih, ofmt }; + uint_t nenabled = 0; + + /* + * Check if any interfaces are enabled for probe-based failure + * 
detection. If not, immediately fail. + */ + walk_if(ih, probe_enabled_cbfunc, &nenabled); + if (nenabled == 0) + die("probe-based failure detection is disabled\n"); + + probe_output_start = gethrtime(); + + /* + * Unfortunately, until 4791900 is fixed, only privileged processes + * can bind and thus receive sysevents. + */ + errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evch, EVCH_CREAT); + if (errno != 0) { + if (errno == EPERM) + die("insufficient privileges for -p\n"); + die("sysevent_evc_bind to channel %s failed", IPMP_EVENT_CHAN); + } + + /* + * The subscriber must be unique in order for sysevent_evc_subscribe() + * to succeed, so combine our name and pid. + */ + (void) snprintf(sub, sizeof (sub), "%d-%s", getpid(), progname); + + errno = sysevent_evc_subscribe(evch, sub, EC_IPMP, probe_event, &ps, 0); + if (errno != 0) + die("sysevent_evc_subscribe for class %s failed", EC_IPMP); + + for (;;) + (void) pause(); +} + +static int +probe_event(sysevent_t *ev, void *arg) +{ + nvlist_t *nvl; + uint32_t state; + uint32_t version; + ipmpstat_probe_state_t *psp = arg; + + if (strcmp(sysevent_get_subclass_name(ev), ESC_IPMP_PROBE_STATE) != 0) + return (0); + + if (sysevent_get_attr_list(ev, &nvl) != 0) { + warn("sysevent_get_attr_list failed; dropping event"); + return (0); + } + + if (nvlist_lookup_uint32(nvl, IPMP_EVENT_VERSION, &version) != 0) { + warn("dropped event with no IPMP_EVENT_VERSION\n"); + goto out; + } + + if (version != IPMP_EVENT_CUR_VERSION) { + warn("dropped event with unsupported IPMP_EVENT_VERSION %d\n", + version); + goto out; + } + + if (nvlist_lookup_uint32(nvl, IPMP_PROBE_STATE, &state) != 0) { + warn("dropped event with no IPMP_PROBE_STATE\n"); + goto out; + } + + if (state == IPMP_PROBE_ACKED || state == IPMP_PROBE_LOST) + ofmt_output(psp->ps_ofmt, psp->ps_ih, nvl); +out: + nvlist_free(nvl); + return (0); +} + +static void +sfunc_targ_ifname(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_targinfo_t *targinfop = arg->sa_data; + 
+ (void) strlcpy(buf, targinfop->it_name, bufsize); +} + +static void +sfunc_targ_mode(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_targinfo_t *targinfop = arg->sa_data; + + enum2str(targ_mode, targinfop->it_targmode, buf, bufsize); +} + +static void +sfunc_targ_testaddr(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + ipmp_targinfo_t *targinfop = arg->sa_data; + + if (targinfop->it_targmode != IPMP_TARG_DISABLED) + sockaddr2str(&targinfop->it_testaddr, buf, bufsize); +} + +static void +sfunc_targ_targets(ipmpstat_sfunc_arg_t *arg, char *buf, uint_t bufsize) +{ + uint_t i; + char *targname = alloca(bufsize); + ipmp_targinfo_t *targinfop = arg->sa_data; + ipmp_addrlist_t *targlistp = targinfop->it_targlistp; + + for (i = 0; i < targlistp->al_naddr; i++) { + sockaddr2str(&targlistp->al_addrs[i], targname, bufsize); + (void) strlcat(buf, targname, bufsize); + if ((i + 1) < targlistp->al_naddr) + (void) strlcat(buf, " ", bufsize); + } +} + +static void +info_output_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + ofmt_output(arg, ih, infop); +} + +static void +targinfo_output_cbfunc(ipmp_handle_t ih, void *infop, void *arg) +{ + ipmp_ifinfo_t *ifinfop = infop; + ipmp_if_targmode_t targmode4 = ifinfop->if_targinfo4.it_targmode; + ipmp_if_targmode_t targmode6 = ifinfop->if_targinfo6.it_targmode; + + /* + * Usually, either IPv4 or IPv6 probing will be enabled, but the admin + * may enable both. If only one is enabled, omit the other one so as + * to not encourage the admin to enable both. If neither is enabled, + * we still print one just so the admin can see a MODE of "disabled". + */ + if (targmode4 != IPMP_TARG_DISABLED || targmode6 == IPMP_TARG_DISABLED) + ofmt_output(arg, ih, &ifinfop->if_targinfo4); + if (targmode6 != IPMP_TARG_DISABLED) + ofmt_output(arg, ih, &ifinfop->if_targinfo6); +} + +/* + * Creates an ipmpstat_ofmt_t field list from the comma-separated list of + * user-specified fields passed via `ofields'. 
The table of known fields + * (and their attributes) is passed via `fields'. + */ +static ipmpstat_ofmt_t * +ofmt_create(const char *ofields, ipmpstat_field_t fields[]) +{ + char *token, *lasts, *ofields_dup; + const char *fieldname; + ipmpstat_ofmt_t *ofmt, *ofmt_head = NULL, *ofmt_tail; + ipmpstat_field_t *fieldp; + uint_t cols = 0; + + /* + * If "-o" was omitted or "-o all" was specified, build a list of + * field names. If "-o" was omitted, stop building the list when + * we run out of columns. + */ + if (ofields == NULL || strcasecmp(ofields, "all") == 0) { + for (fieldp = fields; fieldp->f_name != NULL; fieldp++) { + cols += fieldp->f_width; + if (ofields == NULL && cols > IPMPSTAT_NCOL) + break; + + if ((ofmt = calloc(sizeof (*ofmt), 1)) == NULL) + die("cannot allocate output format list"); + + ofmt->o_field = fieldp; + if (ofmt_head == NULL) { + ofmt_head = ofmt; + ofmt_tail = ofmt; + } else { + ofmt_tail->o_next = ofmt; + ofmt_tail = ofmt; + } + } + return (ofmt_head); + } + + if ((ofields_dup = strdup(ofields)) == NULL) + die("cannot allocate output format list"); + + token = ofields_dup; + while ((fieldname = strtok_r(token, ",", &lasts)) != NULL) { + token = NULL; + + if ((fieldp = field_find(fields, fieldname)) == NULL) { + /* + * Since machine parsers are unlikely to be able to + * gracefully handle missing fields, die if we're in + * parsable mode. Otherwise, just print a warning. 
+ */ + if (opt & IPMPSTAT_OPT_PARSABLE) + die("unknown output field `%s'\n", fieldname); + + warn("ignoring unknown output field `%s'\n", fieldname); + continue; + } + + if ((ofmt = calloc(sizeof (*ofmt), 1)) == NULL) + die("cannot allocate output format list"); + + ofmt->o_field = fieldp; + if (ofmt_head == NULL) { + ofmt_head = ofmt; + ofmt_tail = ofmt; + } else { + ofmt_tail->o_next = ofmt; + ofmt_tail = ofmt; + } + } + + free(ofields_dup); + if (ofmt_head == NULL) + die("no valid output fields specified\n"); + + return (ofmt_head); +} + +/* + * Destroys the provided `ofmt' field list. + */ +static void +ofmt_destroy(ipmpstat_ofmt_t *ofmt) +{ + ipmpstat_ofmt_t *ofmt_next; + + for (; ofmt != NULL; ofmt = ofmt_next) { + ofmt_next = ofmt->o_next; + free(ofmt); + } +} + +/* + * Outputs a header for the fields named by `ofmt'. + */ +static void +ofmt_output_header(const ipmpstat_ofmt_t *ofmt) +{ + const ipmpstat_field_t *fieldp; + + for (; ofmt != NULL; ofmt = ofmt->o_next) { + fieldp = ofmt->o_field; + + if (ofmt->o_next == NULL) + (void) printf("%s", fieldp->f_name); + else + (void) printf("%-*s", fieldp->f_width, fieldp->f_name); + } + (void) printf("\n"); +} + +/* + * Outputs one row of values for the fields named by `ofmt'. The values to + * output are obtained through the `ofmt' function pointers, which are + * indirectly passed the `ih' and `arg' structures for state; see the block + * comment at the start of this file for details. + */ +static void +ofmt_output(const ipmpstat_ofmt_t *ofmt, ipmp_handle_t ih, void *arg) +{ + int i; + char buf[1024]; + boolean_t escsep; + static int nrow; + const char *value; + uint_t width, valwidth; + uint_t compress, overflow = 0; + const ipmpstat_field_t *fieldp; + ipmpstat_sfunc_arg_t sfunc_arg; + + /* + * For each screenful of data, display the header. 
+ */ + if ((nrow++ % winsize.ws_row) == 0 && !(opt & IPMPSTAT_OPT_PARSABLE)) { + ofmt_output_header(ofmt); + nrow++; + } + + /* + * Check if we'll be displaying multiple fields per line, and thus + * need to escape the field separator. + */ + escsep = (ofmt != NULL && ofmt->o_next != NULL); + + for (; ofmt != NULL; ofmt = ofmt->o_next) { + fieldp = ofmt->o_field; + + sfunc_arg.sa_ih = ih; + sfunc_arg.sa_data = arg; + + buf[0] = '\0'; + (*fieldp->f_sfunc)(&sfunc_arg, buf, sizeof (buf)); + + if (opt & IPMPSTAT_OPT_PARSABLE) { + for (i = 0; buf[i] != '\0'; i++) { + if (escsep && (buf[i] == ':' || buf[i] == '\\')) + (void) putchar('\\'); + (void) putchar(buf[i]); + } + if (ofmt->o_next != NULL) + (void) putchar(':'); + } else { + value = (buf[0] == '\0') ? "--" : buf; + + /* + * To avoid needless line-wraps, for the last field, + * don't include any trailing whitespace. + */ + if (ofmt->o_next == NULL) { + (void) printf("%s", value); + continue; + } + + /* + * For other fields, grow the width as necessary to + * ensure the value completely fits. However, if + * there's unused whitespace in subsequent fields, + * then "compress" that whitespace to attempt to get + * the columns to line up again. + */ + width = fieldp->f_width; + valwidth = strlen(value); + + if (valwidth + overflow >= width) { + overflow += valwidth - width + 1; + (void) printf("%s ", value); + continue; + } + + if (overflow > 0) { + compress = MIN(overflow, width - valwidth); + overflow -= compress; + width -= compress; + } + (void) printf("%-*s", width, value); + } + } + (void) printf("\n"); + + /* + * In case stdout has been redirected to e.g. a pipe, flush stdout so + * that commands can act on our output immediately. + */ + (void) fflush(stdout); +} + +/* + * Searches the `fields' array for a field matching `fieldname'. Returns + * a pointer to that field on success, or NULL on failure. 
+ */ +static ipmpstat_field_t * +field_find(ipmpstat_field_t *fields, const char *fieldname) +{ + ipmpstat_field_t *fieldp; + + for (fieldp = fields; fieldp->f_name != NULL; fieldp++) { + if (strcasecmp(fieldp->f_name, fieldname) == 0) + return (fieldp); + } + return (NULL); +} + +/* + * Uses `enums' to map `enumval' to a string, and stores at most `bufsize' + * bytes of that string into `buf'. + */ +static void +enum2str(const ipmpstat_enum_t *enums, int enumval, char *buf, uint_t bufsize) +{ + const ipmpstat_enum_t *enump; + + for (enump = enums; enump->e_name != NULL; enump++) { + if (enump->e_val == enumval) { + (void) strlcpy(buf, enump->e_name, bufsize); + return; + } + } + (void) snprintf(buf, bufsize, "<%d>", enumval); +} + +/* + * Stores the stringified value of the sockaddr_storage pointed to by `ssp' + * into at most `bufsize' bytes of `buf'. + */ +static void +sockaddr2str(const struct sockaddr_storage *ssp, char *buf, uint_t bufsize) +{ + int flags = NI_NOFQDN; + socklen_t socklen; + struct sockaddr *sp = (struct sockaddr *)ssp; + + /* + * Sadly, getnameinfo() does not allow the socklen to be oversized for + * a given family -- so we must determine the exact size to pass to it. 
+ */ + switch (ssp->ss_family) { + case AF_INET: + socklen = sizeof (struct sockaddr_in); + break; + case AF_INET6: + socklen = sizeof (struct sockaddr_in6); + break; + default: + (void) strlcpy(buf, "?", bufsize); + return; + } + + if (opt & IPMPSTAT_OPT_NUMERIC) + flags |= NI_NUMERICHOST; + + (void) getnameinfo(sp, socklen, buf, bufsize, NULL, 0, flags); +} + +static void +sighandler(int sig) +{ + assert(sig == SIGWINCH); + + if (ioctl(1, TIOCGWINSZ, &winsize) == -1 || + winsize.ws_col == 0 || winsize.ws_row == 0) { + winsize.ws_col = 80; + winsize.ws_row = 24; + } +} + +static void +usage(void) +{ + const char *argstr = gettext("[-n] [-o <field> [-P]] -a|-g|-i|-p|-t"); + + (void) fprintf(stderr, gettext("usage: %s %s\n"), progname, argstr); + exit(EXIT_FAILURE); +} + +/* PRINTFLIKE1 */ +static void +warn(const char *format, ...) +{ + va_list alist; + int error = errno; + + format = gettext(format); + (void) fprintf(stderr, gettext("%s: warning: "), progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + if (strchr(format, '\n') == NULL) + (void) fprintf(stderr, ": %s\n", strerror(error)); +} + +/* PRINTFLIKE2 */ +static void +warn_ipmperr(int ipmperr, const char *format, ...) +{ + va_list alist; + + format = gettext(format); + (void) fprintf(stderr, gettext("%s: warning: "), progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + (void) fprintf(stderr, ": %s\n", ipmp_errmsg(ipmperr)); +} + +/* PRINTFLIKE1 */ +static void +die(const char *format, ...) +{ + va_list alist; + int error = errno; + + format = gettext(format); + (void) fprintf(stderr, "%s: ", progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + if (strchr(format, '\n') == NULL) + (void) fprintf(stderr, ": %s\n", strerror(error)); + + exit(EXIT_FAILURE); +} + +/* PRINTFLIKE2 */ +static void +die_ipmperr(int ipmperr, const char *format, ...) 
+{ + va_list alist; + + format = gettext(format); + (void) fprintf(stderr, "%s: ", progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + (void) fprintf(stderr, ": %s\n", ipmp_errmsg(ipmperr)); + + exit(EXIT_FAILURE); +} + +static ipmpstat_field_t addr_fields[] = { + { "ADDRESS", 26, sfunc_addr_address }, + { "STATE", 7, sfunc_addr_state }, + { "GROUP", 12, sfunc_addr_group }, + { "INBOUND", 12, sfunc_addr_inbound }, + { "OUTBOUND", 23, sfunc_addr_outbound }, + { NULL, 0, NULL } +}; + +static ipmpstat_field_t group_fields[] = { + { "GROUP", 12, sfunc_group_ifname }, + { "GROUPNAME", 12, sfunc_group_name }, + { "STATE", 10, sfunc_group_state }, + { "FDT", 10, sfunc_group_fdt }, + { "INTERFACES", 30, sfunc_group_interfaces }, + { NULL, 0, NULL } +}; + +static ipmpstat_field_t if_fields[] = { + { "INTERFACE", 12, sfunc_if_name }, + { "ACTIVE", 8, sfunc_if_active }, + { "GROUP", 12, sfunc_if_group }, + { "FLAGS", 10, sfunc_if_flags }, + { "LINK", 10, sfunc_if_link }, + { "PROBE", 10, sfunc_if_probe }, + { "STATE", 10, sfunc_if_state }, + { NULL, 0, NULL } +}; + +static ipmpstat_field_t probe_fields[] = { + { "TIME", 10, sfunc_probe_time }, + { "INTERFACE", 12, sfunc_probe_ifname }, + { "PROBE", 7, sfunc_probe_id }, + { "NETRTT", 10, sfunc_probe_netrtt }, + { "RTT", 10, sfunc_probe_rtt }, + { "RTTAVG", 10, sfunc_probe_rttavg }, + { "TARGET", 20, sfunc_probe_target }, + { "RTTDEV", 10, sfunc_probe_rttdev }, + { NULL, 0, NULL } +}; + +static ipmpstat_field_t targ_fields[] = { + { "INTERFACE", 12, sfunc_targ_ifname }, + { "MODE", 10, sfunc_targ_mode }, + { "TESTADDR", 20, sfunc_targ_testaddr }, + { "TARGETS", 38, sfunc_targ_targets }, + { NULL, 0, NULL } +}; + +static ipmpstat_enum_t addr_state[] = { + { "up", IPMP_ADDR_UP }, + { "down", IPMP_ADDR_DOWN }, + { NULL, 0 } +}; + +static ipmpstat_enum_t group_state[] = { + { "ok", IPMP_GROUP_OK }, + { "failed", IPMP_GROUP_FAILED }, + { "degraded", IPMP_GROUP_DEGRADED }, + { NULL, 0 
} +}; + +static ipmpstat_enum_t if_link[] = { + { "up", IPMP_LINK_UP }, + { "down", IPMP_LINK_DOWN }, + { "unknown", IPMP_LINK_UNKNOWN }, + { NULL, 0 } +}; + +static ipmpstat_enum_t if_probe[] = { + { "ok", IPMP_PROBE_OK }, + { "failed", IPMP_PROBE_FAILED }, + { "unknown", IPMP_PROBE_UNKNOWN }, + { "disabled", IPMP_PROBE_DISABLED }, + { NULL, 0 } +}; + +static ipmpstat_enum_t if_state[] = { + { "ok", IPMP_IF_OK }, + { "failed", IPMP_IF_FAILED }, + { "unknown", IPMP_IF_UNKNOWN }, + { "offline", IPMP_IF_OFFLINE }, + { NULL, 0 } +}; + +static ipmpstat_enum_t targ_mode[] = { + { "disabled", IPMP_TARG_DISABLED }, + { "routes", IPMP_TARG_ROUTES }, + { "multicast", IPMP_TARG_MULTICAST }, + { NULL, 0 } +}; diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.xcl b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.xcl new file mode 100644 index 0000000000..e2398aaf64 --- /dev/null +++ b/usr/src/cmd/cmd-inet/usr.sbin/ipmpstat/ipmpstat.xcl @@ -0,0 +1,106 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +msgid " " +msgid "%-*s" +msgid "%.2fms" +msgid "%.2fs" +msgid "%d-%s" +msgid "%s" +msgid "%s " +msgid "%s: " +msgid "%u" +msgid "(" +msgid ")" +msgid "," +msgid "--" +msgid ": %s\n" +msgid "?" +msgid "[" +msgid "]" +msgid "<%d>" +msgid "\n" +msgid "ACTIVE" +msgid "ADDRESS" +msgid "EC_ipmp" +msgid "ESC_ipmp_probe_state" +msgid "FDT" +msgid "FLAGS" +msgid "GROUP" +msgid "GROUPNAME" +msgid "INBOUND" +msgid "INTERFACE" +msgid "INTERFACES" +msgid "IPMP_IF_NAME" +msgid "IPMP_PROBE_ACKPROC_TIME" +msgid "IPMP_PROBE_ACKRECV_TIME" +msgid "IPMP_PROBE_ID" +msgid "IPMP_PROBE_SENT_TIME" +msgid "IPMP_PROBE_START_TIME" +msgid "IPMP_PROBE_STATE" +msgid "IPMP_PROBE_TARGET" +msgid "IPMP_PROBE_TARGET_RTTAVG" +msgid "IPMP_PROBE_TARGET_RTTDEV" +msgid "LINK" +msgid "MODE" +msgid "NETRTT" +msgid "OUTBOUND" +msgid "PROBE" +msgid "RTT" +msgid "RTTAVG" +msgid "RTTDEV" +msgid "STATE" +msgid "TARGET" +msgid "TARGETS" +msgid "TESTADDR" +msgid "TIME" +msgid "agipt" +msgid "all" +msgid "bufsize > IPMPSTAT_NUM_FLAGS" +msgid "com.sun:ipmp:events" +msgid "degraded" +msgid "disabled" +msgid "down" +msgid "failed" +msgid "ipmp_event_version" +msgid "ipmp_if_name" +msgid "ipmp_probe_ackproc_time" +msgid "ipmp_probe_ackrecv_time" +msgid "ipmp_probe_id" +msgid "ipmp_probe_sent_time" +msgid "ipmp_probe_start_time" +msgid "ipmp_probe_state" +msgid "ipmp_probe_target" +msgid "ipmp_probe_target_rttavg" +msgid "ipmp_probe_target_rttdev" +msgid "ipmpstat.c" +msgid "multicast" +msgid "nLPo:agipt" +msgid "no" +msgid "offline" +msgid "ok" +msgid "routes" +msgid "sig == SIGWINCH" +msgid "unknown" +msgid "up" +msgid "yes" diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types b/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types index bb15199492..e42bc626d8 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types +++ b/usr/src/cmd/cmd-inet/usr.sbin/ipqosconf/ipgpc.types @@ -1,13 +1,12 @@ # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. # Use is subject to license terms. # # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -23,15 +22,12 @@ # CDDL HEADER END # -#pragma ident "%Z%%M% %I% %E% SMI" - fmt_version 1.0 mod_version 1.0 #PERM_CLASS default filter name string -filter if_groupname string filter user user filter projid int32 filter if_name ifname diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c index 17891ffc78..2a4ff60d57 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c @@ -18,7 +18,7 @@ * * CDDL HEADER END * - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,8 +37,6 @@ * contributors. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdio.h> #include <strings.h> #include <errno.h> @@ -243,7 +241,7 @@ main(int argc, char *argv[]) ushort_t udp_src_port6; /* used to identify replies */ uint_t flowinfo = 0; uint_t class = 0; - char tmp_buf[INET6_ADDRSTRLEN]; + char abuf[INET6_ADDRSTRLEN]; int c; int i; boolean_t has_sys_ip_config; @@ -671,24 +669,18 @@ main(int argc, char *argv[]) Printf("PING %s: %d data bytes\n", targethost, datalen); } else { if (ai_dst->ai_family == AF_INET) { - Printf("PING %s (%s): %d data bytes\n", - targethost, - inet_ntop(AF_INET, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - &((struct sockaddr_in *) - ai_dst->ai_addr)->sin_addr, - tmp_buf, sizeof (tmp_buf)), - datalen); + (void) inet_ntop(AF_INET, + &((struct sockaddr_in *)(void *) + ai_dst->ai_addr)->sin_addr, + abuf, sizeof (abuf)); } else { - Printf("PING %s (%s): %d data bytes\n", - targethost, - inet_ntop(AF_INET6, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - &((struct sockaddr_in6 *) - ai_dst->ai_addr)->sin6_addr, - tmp_buf, sizeof (tmp_buf)), - datalen); + (void) inet_ntop(AF_INET6, + &((struct sockaddr_in6 *)(void *) + ai_dst->ai_addr)->sin6_addr, + abuf, sizeof (abuf)); } + Printf("PING %s (%s): %d data bytes\n", + targethost, abuf, datalen); } } @@ -1074,12 +1066,12 @@ select_all_src_addrs(union any_in_addr **src_addr_list, struct addrinfo *ai, int num_dst = 1; int i; - if (probe_all) - for (aip = ai; aip->ai_next != NULL; - aip = aip->ai_next, num_dst++); + if (probe_all) { + for (aip = ai; aip->ai_next != NULL; aip = aip->ai_next) + num_dst++; + } - list = (union any_in_addr *) - calloc((size_t)num_dst, sizeof (union any_in_addr)); + list = calloc((size_t)num_dst, sizeof (union any_in_addr)); if (list == NULL) { Fprintf(stderr, "%s: calloc: %s\n", progname, strerror(errno)); exit(EXIT_FAILURE); @@ -1472,7 +1464,7 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, int i; /* pull out the interface list */ - num_ifs = ifaddrlist(&al, family, errbuf); 
+ num_ifs = ifaddrlist(&al, family, LIFC_UNDER_IPMP, errbuf); if (num_ifs == -1) { Fprintf(stderr, "%s: %s\n", progname, errbuf); exit(EXIT_FAILURE); @@ -1699,8 +1691,8 @@ send_scheduled_probe() } else { Printf("no answer from %s(%s)\n", targethost, inet_ntop(current_targetaddr->family, - &current_targetaddr->dst_addr, - tmp_buf, sizeof (tmp_buf))); + &current_targetaddr->dst_addr, + tmp_buf, sizeof (tmp_buf))); } } /* @@ -1736,9 +1728,8 @@ send_scheduled_probe() * Each time we move to a new targetaddr, which has * a different target IP address, we update this field. */ - current_targetaddr->starting_seq_num = - use_udp ? dest_port : - (ntransmitted % (MAX_ICMP_SEQ + 1)); + current_targetaddr->starting_seq_num = use_udp ? + dest_port : (ntransmitted % (MAX_ICMP_SEQ + 1)); } } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c index f062247997..e5b23fa126 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -139,7 +139,7 @@ check_device(dlpi_handle_t *dhp, char **devicep) if (ioctl(s, SIOCGIFFLAGS, (char *)ifr) < 0) pr_err("ioctl SIOCGIFFLAGS"); if ((ifr->ifr_flags & - (IFF_VIRTUAL|IFF_LOOPBACK|IFF_UP| + (IFF_VIRTUAL|IFF_IPMP|IFF_UP| IFF_RUNNING)) == (IFF_UP|IFF_RUNNING)) break; } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c index adc6a932b0..cae75df60d 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -27,8 +27,6 @@ * @(#)$Header: traceroute.c,v 1.49 97/06/13 02:30:23 leres Exp $ (LBL) */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/param.h> #include <sys/file.h> #include <sys/ioctl.h> @@ -707,7 +705,7 @@ get_hostinfo(char *host, int family, struct addrinfo **aipp) struct addrinfo hints, *ai; struct in6_addr addr6; struct in_addr addr; - char temp_buf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ + char abuf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ int rc; /* @@ -720,11 +718,10 @@ get_hostinfo(char *host, int family, struct addrinfo **aipp) IN6_V4MAPPED_TO_INADDR(&addr6, &addr); /* convert it back to a string */ - (void) inet_ntop(AF_INET, (void *)&addr, temp_buf, - sizeof (temp_buf)); + (void) inet_ntop(AF_INET, &addr, abuf, sizeof (abuf)); /* now the host is an IPv4 address */ - (void) strcpy(host, temp_buf); + (void) strcpy(host, abuf); /* * If it's a mapped address, we convert it into IPv4 @@ -826,15 +823,19 @@ set_src_addr(struct pr_set *pr, struct ifaddrlist **alp) struct sockaddr_in6 *sin6_from = (struct sockaddr_in6 *)pr->from; struct addrinfo *aip; char errbuf[ERRBUFSIZE]; - char temp_buf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ + char abuf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ int num_ifs; /* all the interfaces */ int num_src_ifs; /* exclude loopback and down */ int i; + uint_t ifaddrflags = 0; source = source_input; + if (device != NULL) + ifaddrflags |= LIFC_UNDER_IPMP; + /* get the interface address list */ - num_ifs = ifaddrlist(&al, pr->family, errbuf); + num_ifs = ifaddrlist(&al, pr->family, ifaddrflags, errbuf); if (num_ifs < 0) { Fprintf(stderr, "%s: ifaddrlist: %s\n", prog, errbuf); exit(EXIT_FAILURE); @@ -881,26 +882,20 @@ set_src_addr(struct pr_set *pr, struct ifaddrlist **alp) if (pr->family == AF_INET) ap = (union any_in_addr *) /* LINTED E_BAD_PTR_CAST_ALIGN */ - &((struct sockaddr_in *) - aip->ai_addr)->sin_addr; + &((struct sockaddr_in *)aip->ai_addr)->sin_addr; else ap = (union any_in_addr *) /* LINTED 
E_BAD_PTR_CAST_ALIGN */ - &((struct sockaddr_in6 *) - aip->ai_addr)->sin6_addr; + &((struct sockaddr_in6 *)aip->ai_addr)->sin6_addr; /* * LBNL bug fixed: used to accept any src address */ tmp2_al = find_ifaddr(al, num_ifs, ap, pr->family); - if (tmp2_al == NULL) { - Fprintf(stderr, - "%s: %s is not a local %s address\n", - prog, inet_ntop(pr->family, ap, - temp_buf, sizeof (temp_buf)), - pr->name); - + (void) inet_ntop(pr->family, ap, abuf, sizeof (abuf)); + Fprintf(stderr, "%s: %s is not a local %s address\n", + prog, abuf, pr->name); free(al); freeaddrinfo(aip); return (0); @@ -928,13 +923,11 @@ set_src_addr(struct pr_set *pr, struct ifaddrlist **alp) set_sin(pr->from, ap, pr->family); if (aip->ai_next != NULL) { - Fprintf(stderr, - "%s: Warning: %s has multiple " - "addresses; using %s\n", - prog, source, - inet_ntop(pr->family, - (const void *)pr->from_sin_addr, - temp_buf, sizeof (temp_buf))); + (void) inet_ntop(pr->family, pr->from_sin_addr, + abuf, sizeof (abuf)); + Fprintf(stderr, "%s: Warning: %s has multiple " + "addresses; using %s\n", prog, source, + abuf); } } else { /* -i and -s used */ /* @@ -1484,7 +1477,7 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr, uchar_t code; /* icmp code */ int reply; int seq = 0; - char temp_buf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ + char abuf[INET6_ADDRSTRLEN]; /* use for inet_ntop() */ int longjmp_return; /* return value from longjump */ struct ip *ip = (struct ip *)packet; boolean_t got_there = _B_FALSE; /* we hit the destination */ @@ -1535,13 +1528,11 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr, if (dev_name == NULL) dev_name = "?"; + (void) inet_ntop(pr->family, pr->from_sin_addr, abuf, + sizeof (abuf)); Fprintf(stderr, "%s: Warning: Multiple interfaces found;" - " using %s @ %s\n", - prog, inet_ntop(pr->family, - (const void *)pr->from_sin_addr, - temp_buf, sizeof (temp_buf)), - dev_name); + " using %s @ %s\n", prog, abuf, dev_name); } } 
@@ -1558,8 +1549,7 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr, Fprintf(stderr, "%s to %s", prog, hostname); } else { Fprintf(stderr, "%s to %s (%s)", prog, hostname, - inet_ntop(pr->family, (const void *)ip_addr, temp_buf, - sizeof (temp_buf))); + inet_ntop(pr->family, ip_addr, abuf, sizeof (abuf))); } if (source) @@ -1700,9 +1690,8 @@ traceroute(union any_in_addr *ip_addr, struct msghdr *msg6, struct pr_set *pr, } if (pr->family == AF_INET6) { - intp = - (int *)find_ancillary_data(&in_msg, - IPPROTO_IPV6, IPV6_HOPLIMIT); + intp = find_ancillary_data(&in_msg, + IPPROTO_IPV6, IPV6_HOPLIMIT); if (intp == NULL) { Fprintf(stderr, "%s: can't find " @@ -2188,10 +2177,11 @@ static void usage(void) { Fprintf(stderr, "Usage: %s [-adFIlnSvx] [-A address_family] " -"[-c traffic_class] \n" -"\t[-f first_hop] [-g gateway [-g gateway ...]| -r] [-i iface]\n" -"\t[-L flow_label] [-m max_hop] [-P pause_sec] [-p port] [-Q max_timeout]\n" -"\t[-q nqueries] [-s src_addr] [-t tos] [-w wait_time] host [packetlen]\n", - prog); + "[-c traffic_class]\n" + "\t[-f first_hop] [-g gateway [-g gateway ...]| -r] [-i iface]\n" + "\t[-L flow_label] [-m max_hop] [-P pause_sec] [-p port] " + "[-Q max_timeout]\n" + "\t[-q nqueries] [-s src_addr] [-t tos] [-w wait_time] host " + "[packetlen]\n", prog); exit(EXIT_FAILURE); } diff --git a/usr/src/cmd/devfsadm/misc_link.c b/usr/src/cmd/devfsadm/misc_link.c index c72be6be37..44756c3e98 100644 --- a/usr/src/cmd/devfsadm/misc_link.c +++ b/usr/src/cmd/devfsadm/misc_link.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -104,7 +104,7 @@ static devfsadm_create_t misc_cbt[] = { "(^ip$)|(^tcp$)|(^udp$)|(^icmp$)|(^sctp$)|" "(^ip6$)|(^tcp6$)|(^udp6$)|(^icmp6$)|(^sctp6$)|" "(^rts$)|(^arp$)|(^ipsecah$)|(^ipsecesp$)|(^keysock$)|(^spdsock$)|" - "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)", + "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)|(^dlpistub$)", TYPE_EXACT | DRV_RE, ILEVEL_1, minor_name }, { "pseudo", "ddi_pseudo", diff --git a/usr/src/cmd/mdb/common/modules/ip/ip.c b/usr/src/cmd/mdb/common/modules/ip/ip.c index f2dadd5261..f064b58d83 100644 --- a/usr/src/cmd/mdb/common/modules/ip/ip.c +++ b/usr/src/cmd/mdb/common/modules/ip/ip.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stropts.h> #include <sys/stream.h> @@ -524,8 +522,7 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg) static const mdb_bitmask_t mmasks[] = { { "CONDEMNED", IRE_MARK_CONDEMNED, IRE_MARK_CONDEMNED }, - { "NORECV", IRE_MARK_NORECV, IRE_MARK_NORECV }, - { "HIDDEN", IRE_MARK_HIDDEN, IRE_MARK_HIDDEN }, + { "TESTHIDDEN", IRE_MARK_TESTHIDDEN, IRE_MARK_TESTHIDDEN }, { "NOADD", IRE_MARK_NOADD, IRE_MARK_NOADD }, { "TEMPORARY", IRE_MARK_TEMPORARY, IRE_MARK_TEMPORARY }, { "USESRC", IRE_MARK_USESRC_CHECK, IRE_MARK_USESRC_CHECK }, diff --git a/usr/src/cmd/rcm_daemon/Makefile.com b/usr/src/cmd/rcm_daemon/Makefile.com index 365371c45c..dbe3c1f1d1 100644 --- a/usr/src/cmd/rcm_daemon/Makefile.com +++ b/usr/src/cmd/rcm_daemon/Makefile.com @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# @@ -124,7 +124,7 @@ SUNW_network_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_vlan_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_vnic_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_aggr_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm -SUNW_ip_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil -ldladm +SUNW_ip_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil -ldladm -lipmp SUNW_ip_anon_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil LDLIBS += -lgen -lelf -lrcm -lnvpair -ldevinfo -lnsl -lsocket diff --git a/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c b/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c index be9a31f952..6e1fe1bf39 100644 --- a/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c +++ b/usr/src/cmd/rcm_daemon/common/ip_anon_rcm.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * RCM module to prevent plumbed IP addresses from being removed. 
*/ @@ -177,7 +175,7 @@ ip_anon_register(rcm_handle_t *hdl) if (_cladm(CL_INITIALIZE, CL_GET_BOOTFLAG, &bootflags) != 0) { rcm_log_message(RCM_ERROR, - gettext("unable to check cluster status\n")); + gettext("unable to check cluster status\n")); (void) mutex_unlock(&ip_list_lock); return (RCM_FAILURE); } @@ -199,7 +197,7 @@ ip_anon_register(rcm_handle_t *hdl) else { if ((exclude_addrs.cladm_netaddrs_array = malloc(sizeof (cladm_netaddr_entry_t) * - (num_exclude_addrs))) == NULL) { + (num_exclude_addrs))) == NULL) { rcm_log_message(RCM_ERROR, gettext("out of memory\n")); (void) mutex_unlock(&ip_list_lock); @@ -274,7 +272,7 @@ ip_anon_register(rcm_handle_t *hdl) rcm_log_message(RCM_DEBUG, "ip_anon: obtaining list of IPv4 addresses.\n"); - num_ifs = ifaddrlist(&al, AF_INET, errbuf); + num_ifs = ifaddrlist(&al, AF_INET, LIFC_UNDER_IPMP, errbuf); if (num_ifs == -1) { rcm_log_message(RCM_ERROR, gettext("cannot get IPv4 address list errno=%d (%s)\n"), @@ -286,7 +284,7 @@ ip_anon_register(rcm_handle_t *hdl) rcm_log_message(RCM_DEBUG, "ip_anon: obtaining list of IPv6 addresses.\n"); - num_ifs6 = ifaddrlist(&al6, AF_INET6, errbuf); + num_ifs6 = ifaddrlist(&al6, AF_INET6, LIFC_UNDER_IPMP, errbuf); if (num_ifs6 == -1) { rcm_log_message(RCM_ERROR, gettext("cannot get IPv6 address list errno=%d (%s)\n"), @@ -392,7 +390,7 @@ ip_anon_register(rcm_handle_t *hdl) * currently know about it. */ if (!(tentry->flags & IP_FLAG_CL) && - !(tentry->flags & IP_FLAG_REG)) { + !(tentry->flags & IP_FLAG_REG)) { tentry->flags |= IP_FLAG_REG; rcm_log_message(RCM_DEBUG, "ip_anon: registering interest in %s\n", diff --git a/usr/src/cmd/rcm_daemon/common/ip_rcm.c b/usr/src/cmd/rcm_daemon/common/ip_rcm.c index f62b3dfc19..24be0cafeb 100644 --- a/usr/src/cmd/rcm_daemon/common/ip_rcm.c +++ b/usr/src/cmd/rcm_daemon/common/ip_rcm.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ @@ -38,23 +38,22 @@ #include <errno.h> #include <fcntl.h> #include <sys/types.h> +#include <sys/wait.h> #include <sys/stat.h> #include <sys/socket.h> #include <sys/sockio.h> #include <net/if.h> #include <netinet/in.h> -#include <netinet/tcp.h> #include <arpa/inet.h> #include <stropts.h> #include <strings.h> -#include <libdevinfo.h> -#include <sys/systeminfo.h> -#include <netdb.h> +#include <sys/sysmacros.h> #include <inet/ip.h> #include <libinetutil.h> #include <libdllink.h> +#include <libgen.h> +#include <ipmp_admin.h> -#include <ipmp_mpathd.h> #include "rcm_module.h" /* @@ -75,42 +74,19 @@ #define RCM_LINK_RESOURCE_MAX (13 + LINKID_STR_WIDTH) #define RCM_STR_SUNW_IP "SUNW_ip/" /* IP address export prefix */ -#define RCM_SIZE_SUNW_IP 9 /* strlen("SUNW_ip/") + 1 */ -/* ifconfig(1M) */ -#define USR_SBIN_IFCONFIG "/usr/sbin/ifconfig" /* ifconfig command */ -#define CFGFILE_FMT_IPV4 "/etc/hostname." /* IPV4 config file */ -#define CFGFILE_FMT_IPV6 "/etc/hostname6." /* IPV6 config file */ +#define SBIN_IFCONFIG "/sbin/ifconfig" /* ifconfig command */ +#define SBIN_IFPARSE "/sbin/ifparse" /* ifparse command */ +#define DHCPFILE_FMT "/etc/dhcp.%s" /* DHCP config file */ +#define CFGFILE_FMT_IPV4 "/etc/hostname.%s" /* IPV4 config file */ +#define CFGFILE_FMT_IPV6 "/etc/hostname6.%s" /* IPV6 config file */ #define CFG_CMDS_STD " netmask + broadcast + up" /* Normal config string */ -#define CONFIG_AF_INET 0x1 /* Post-configure IPv4 */ -#define CONFIG_AF_INET6 0x2 /* Post-configure IPv6 */ -#define MAXLINE 1024 /* Max. line length */ -#define MAXARGS 512 /* Max. 
args in ifconfig cmd */ - -/* Physical interface flags mask */ -#define RCM_PIF_FLAGS (IFF_OFFLINE | IFF_INACTIVE | IFF_FAILED | \ - IFF_STANDBY) +#define CFG_DHCP_CMD "dhcp wait 0" /* command to start DHCP */ /* Some useful macros */ -#ifndef MAX -#define MAX(a, b) (((a) > (b))?(a):(b)) -#endif /* MAX */ - -#ifndef ISSPACE #define ISSPACE(c) ((c) == ' ' || (c) == '\t') -#endif - -#ifndef ISEOL #define ISEOL(c) ((c) == '\n' || (c) == '\r' || (c) == '\0') -#endif - -#ifndef STREQ #define STREQ(a, b) (*(a) == *(b) && strcmp((a), (b)) == 0) -#endif - -#ifndef ADDSPACE -#define ADDSPACE(a) ((void) strcat((a), " ")) -#endif /* Interface Cache state flags */ #define CACHE_IF_STALE 0x1 /* stale cached data */ @@ -125,48 +101,20 @@ /* RCM IPMP Module specific property definitions */ #define RCM_IPMP_MIN_REDUNDANCY 1 /* default min. redundancy */ -/* in.mpathd(1M) specifics */ -#define MPATHD_MAX_RETRIES 5 /* Max. offline retries */ - /* Stream module operations */ #define MOD_INSERT 0 /* Insert a mid-stream module */ #define MOD_REMOVE 1 /* Remove a mid-stream module */ #define MOD_CHECK 2 /* Check mid-stream module safety */ /* - * in.mpathd(1M) message passing formats - */ -typedef struct mpathd_cmd { - uint32_t cmd_command; /* message command */ - char cmd_ifname[LIFNAMSIZ]; /* this interface name */ - char cmd_movetoif[LIFNAMSIZ]; /* move to interface */ - uint32_t cmd_min_red; /* min. 
redundancy */ -/* Message passing values for MI_SETOINDEX */ -#define from_lifname cmd_ifname /* current logical interface */ -#define to_pifname cmd_movetoif /* new physical interface */ -#define addr_family cmd_min_red /* address family */ -} mpathd_cmd_t; - -/* This is needed since mpathd checks message size for offline */ -typedef struct mpathd_unoffline { - uint32_t cmd_command; /* offline / undo offline */ - char cmd_ifname[LIFNAMSIZ]; /* this interface name */ -} mpathd_unoffline_t; - -typedef struct mpathd_response { - uint32_t resp_sys_errno; /* system errno */ - uint32_t resp_mpathd_err; /* mpathd error information */ -} mpathd_response_t; - -/* * IP module data types */ /* Physical interface representation */ typedef struct ip_pif { - char pi_ifname[LIFNAMSIZ+1]; /* interface name */ - char pi_grpname[LIFNAMSIZ+1]; /* IPMP group name */ - struct ip_lif *pi_lifs; /* ptr to logical interfaces */ + char pi_ifname[LIFNAMSIZ]; /* interface name */ + char pi_grname[LIFGRNAMSIZ]; /* IPMP group name */ + struct ip_lif *pi_lifs; /* ptr to logical interfaces */ } ip_pif_t; /* Logical interface representation */ @@ -239,7 +187,7 @@ static void free_node(ip_cache_t *); static void cache_insert(ip_cache_t *); static char *ip_usage(ip_cache_t *); static int update_pif(rcm_handle_t *, int, int, struct lifreq *); -static int ip_ipmp_offline(ip_cache_t *, ip_cache_t *); +static int ip_ipmp_offline(ip_cache_t *); static int ip_ipmp_undo_offline(ip_cache_t *); static int if_cfginfo(ip_cache_t *, uint_t); static int if_unplumb(ip_cache_t *); @@ -247,9 +195,6 @@ static int if_replumb(ip_cache_t *); static void ip_log_err(ip_cache_t *, char **, char *); static char *get_link_resource(const char *); static void clr_cfg_state(ip_pif_t *); -static uint64_t if_get_flags(ip_pif_t *); -static int mpathd_send_cmd(mpathd_cmd_t *); -static int connect_to_mpathd(int); static int modop(char *, char *, int, char); static int get_modlist(char *, ip_lif_t *); static int ip_domux2fd(int *, 
int *, int *, struct lifreq *); @@ -262,15 +207,13 @@ static char **ip_get_addrlist(ip_cache_t *); static void ip_free_addrlist(char **); static void ip_consumer_notify(rcm_handle_t *, datalink_id_t, char **, uint_t, rcm_info_t **); +static boolean_t ip_addrstr(ip_lif_t *, char *, size_t); static int if_configure(datalink_id_t); -static int isgrouped(char *); -static int if_ipmp_config(char *, int, int); -static int if_mpathd_configure(char *, char *, int, int); -static char *get_mpathd_dest(char *, int); -static int if_getcount(int); -static void tokenize(char *, char **, char *, int *); - +static boolean_t isgrouped(const char *); +static int if_config_inst(const char *, FILE *, int, boolean_t); +static uint_t ntok(const char *cp); +static boolean_t ifconfig(const char *, const char *, const char *, boolean_t); /* Module-Private data */ static struct rcm_mod_ops ip_ops = @@ -429,9 +372,9 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, { ip_cache_t *node; ip_pif_t *pif; - int detachable = 0; - int nofailover = 0; - int ipmp = 0; + boolean_t detachable = B_FALSE; + boolean_t ipmp; + int retval; rcm_log_message(RCM_TRACE1, "IP: offline(%s)\n", rsrc); @@ -455,25 +398,17 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, pif = node->ip_pif; /* Establish default detachability criteria */ - if (flags & RCM_FORCE) { - detachable++; - } + if (flags & RCM_FORCE) + detachable = B_TRUE; - /* Check if the interface is an IPMP grouped interface */ - if (strcmp(pif->pi_grpname, "")) { - ipmp++; - } - - if (if_get_flags(pif) & IFF_NOFAILOVER) { - nofailover++; - } + /* Check if the interface is under IPMP */ + ipmp = (pif->pi_grname[0] != '\0'); /* - * Even if the interface is not in an IPMP group, it's possible that - * it's still okay to offline it as long as there are higher-level - * failover mechanisms for the addresses it owns (e.g., clustering). - * In this case, ip_offlinelist() will return RCM_SUCCESS, and we - * charge on. 
+ * Even if the interface is not under IPMP, it's possible that it's + * still okay to offline it as long as there are higher-level failover + * mechanisms for the addresses it owns (e.g., clustering). In this + * case, ip_offlinelist() will return RCM_SUCCESS, and we charge on. */ if (!ipmp && !detachable) { /* Inform consumers of IP addresses being offlined */ @@ -489,17 +424,6 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, } } - /* - * Cannot remove an IPMP interface if IFF_NOFAILOVER is set. - */ - if (ipmp && nofailover) { - /* Interface is part of an IPMP group, and cannot failover */ - ip_log_err(node, errorp, "Failover disabled"); - errno = EBUSY; - (void) mutex_unlock(&cache_lock); - return (RCM_FAILURE); - } - /* Check if it's a query */ if (flags & RCM_QUERY) { rcm_log_message(RCM_TRACE1, "IP: offline query success(%s)\n", @@ -534,38 +458,32 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, } /* - * This an IPMP interface that can be failed over. - * Request in.mpathd(1M) to failover the physical interface. + * This is an IPMP interface that can be offlined. + * Request in.mpathd(1M) to offline the physical interface. */ + if ((retval = ip_ipmp_offline(node)) != IPMP_SUCCESS) + ip_log_err(node, errorp, "in.mpathd offline failed"); - /* Failover to "any", let mpathd determine best failover candidate */ - if (ip_ipmp_offline(node, NULL) < 0) { - ip_log_err(node, errorp, "in.mpathd failover failed"); + if (retval == IPMP_EMINRED && !detachable) { /* - * Odds are that in.mpathd(1M) could not offline the device - * because it was the last interface in the group. However, - * it's possible that it's still okay to offline it as long as - * there are higher-level failover mechanisms for the - * addresses it owns (e.g., clustering). In this case, - * ip_offlinelist() will return RCM_SUCCESS, and we charge on. 
- * - * TODO: change ip_ipmp_offline() to return the actual failure - * from in.mpathd so that we can verify that it did indeed - * fail with IPMP_EMINRED. + * in.mpathd(1M) could not offline the device because it was + * the last interface in the group. However, it's possible + * that it's still okay to offline it as long as there are + * higher-level failover mechanisms for the addresses it owns + * (e.g., clustering). In this case, ip_offlinelist() will + * return RCM_SUCCESS, and we charge on. */ - if (!detachable) { - /* Inform consumers of IP addresses being offlined */ - if (ip_offlinelist(hd, node, errorp, flags, - depend_info) == RCM_SUCCESS) { - rcm_log_message(RCM_DEBUG, - "IP: consumers agree on detach"); - } else { - ip_log_err(node, errorp, - "Device consumers prohibit offline"); - (void) mutex_unlock(&cache_lock); - errno = EBUSY; - return (RCM_FAILURE); - } + /* Inform consumers of IP addresses being offlined */ + if (ip_offlinelist(hd, node, errorp, flags, + depend_info) == RCM_SUCCESS) { + rcm_log_message(RCM_DEBUG, + "IP: consumers agree on detach"); + } else { + ip_log_err(node, errorp, + "Device consumers prohibit offline"); + (void) mutex_unlock(&cache_lock); + errno = EBUSY; + return (RCM_FAILURE); } } @@ -574,8 +492,8 @@ ip_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags, _("IP: Unplumb failed (%s)\n"), pif->pi_ifname); - /* Request mpathd to undo the offline */ - if (ip_ipmp_undo_offline(node) < 0) { + /* Request in.mpathd to undo the offline */ + if (ip_ipmp_undo_offline(node) != IPMP_SUCCESS) { ip_log_err(node, errorp, "Undo offline failed"); (void) mutex_unlock(&cache_lock); return (RCM_FAILURE); @@ -862,18 +780,16 @@ static char * ip_usage(ip_cache_t *node) { ip_lif_t *lif; - int numifs; - char *buf; - char *linkidstr; + uint_t numup; + char *sep, *buf, *linkidstr; datalink_id_t linkid; - const char *fmt; - char *sep; + const char *msg; char link[MAXLINKNAMELEN]; char addrstr[INET6_ADDRSTRLEN]; char errmsg[DLADM_STRSIZE]; 
dladm_status_t status; - int offline = 0; - size_t bufsz; + boolean_t offline, ipmp; + size_t bufsz = 0; rcm_log_message(RCM_TRACE2, "IP: usage(%s)\n", node->ip_resource); @@ -904,76 +820,53 @@ ip_usage(ip_cache_t *node) /* TRANSLATION_NOTE: separator used between IP addresses */ sep = _(", "); - numifs = 0; - for (lif = node->ip_pif->pi_lifs; lif != NULL; lif = lif->li_next) { - if (lif->li_ifflags & IFF_UP) { - numifs++; - } - } + numup = 0; + for (lif = node->ip_pif->pi_lifs; lif != NULL; lif = lif->li_next) + if (lif->li_ifflags & IFF_UP) + numup++; - if (node->ip_cachestate & CACHE_IF_OFFLINED) { - offline++; - } + ipmp = (node->ip_pif->pi_grname[0] != '\0'); + offline = ((node->ip_cachestate & CACHE_IF_OFFLINED) != 0); - if (!offline && numifs) { - fmt = _("%1$s hosts IP addresses: "); - } else if (offline) { - fmt = _("%1$s offlined"); + if (offline) { + msg = _("offlined"); + } else if (numup == 0) { + msg = _("plumbed but down"); } else { - fmt = _("%1$s plumbed but down"); + if (ipmp) { + msg = _("providing connectivity for IPMP group "); + bufsz += LIFGRNAMSIZ; + } else { + msg = _("hosts IP addresses: "); + bufsz += (numup * (INET6_ADDRSTRLEN + strlen(sep))); + } } - /* space for addresses and separators, plus message */ - bufsz = ((numifs * (INET6_ADDRSTRLEN + strlen(sep))) + - strlen(fmt) + strlen(link) + 1); + bufsz += strlen(link) + strlen(msg) + 1; if ((buf = malloc(bufsz)) == NULL) { rcm_log_message(RCM_ERROR, _("IP: usage(%s) malloc failure(%s)\n"), node->ip_resource, strerror(errno)); return (NULL); } - bzero(buf, bufsz); - (void) sprintf(buf, fmt, link); - - if (offline || (numifs == 0)) { /* Nothing else to do */ - rcm_log_message(RCM_TRACE2, "IP: usage (%s) info = %s\n", - node->ip_resource, buf); - - return (buf); - } - - for (lif = node->ip_pif->pi_lifs; lif != NULL; lif = lif->li_next) { + (void) snprintf(buf, bufsz, "%s: %s", link, msg); - void *addr; - int af; - - if (!(lif->li_ifflags & IFF_UP)) { - /* ignore interfaces not up */ - 
continue; - } - af = lif->li_addr.family; - if (af == AF_INET6) { - addr = &lif->li_addr.ip6.sin6_addr; - } else if (af == AF_INET) { - addr = &lif->li_addr.ip4.sin_addr; + if (!offline && numup > 0) { + if (ipmp) { + (void) strlcat(buf, node->ip_pif->pi_grname, bufsz); } else { - rcm_log_message(RCM_DEBUG, - "IP: unknown addr family %d, assuming AF_INET\n", - af); - af = AF_INET; - addr = &lif->li_addr.ip4.sin_addr; - } - if (inet_ntop(af, addr, addrstr, INET6_ADDRSTRLEN) == NULL) { - rcm_log_message(RCM_ERROR, - _("IP: inet_ntop: %s\n"), strerror(errno)); - continue; - } - rcm_log_message(RCM_DEBUG, "IP addr := %s\n", addrstr); + lif = node->ip_pif->pi_lifs; + for (; lif != NULL; lif = lif->li_next) { + if (!(lif->li_ifflags & IFF_UP)) + continue; + + if (!ip_addrstr(lif, addrstr, sizeof (addrstr))) + continue; - (void) strcat(buf, addrstr); - numifs--; - if (numifs > 0) { - (void) strcat(buf, ", "); + (void) strlcat(buf, addrstr, bufsz); + if (--numup > 0) + (void) strlcat(buf, sep, bufsz); + } } } @@ -983,6 +876,32 @@ ip_usage(ip_cache_t *node) return (buf); } +static boolean_t +ip_addrstr(ip_lif_t *lif, char *addrstr, size_t addrsize) +{ + int af = lif->li_addr.family; + void *addr; + + if (af == AF_INET6) { + addr = &lif->li_addr.ip6.sin6_addr; + } else if (af == AF_INET) { + addr = &lif->li_addr.ip4.sin_addr; + } else { + rcm_log_message(RCM_DEBUG, + "IP: unknown addr family %d, assuming AF_INET\n", af); + af = AF_INET; + addr = &lif->li_addr.ip4.sin_addr; + } + if (inet_ntop(af, addr, addrstr, addrsize) == NULL) { + rcm_log_message(RCM_ERROR, + _("IP: inet_ntop: %s\n"), strerror(errno)); + return (B_FALSE); + } + + rcm_log_message(RCM_DEBUG, "IP addr := %s\n", addrstr); + return (B_TRUE); +} + /* * Cache management routines, all cache management functions should be * be called with cache_lock held. 
@@ -1121,11 +1040,13 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr) ifnumber = ifspec.ifsp_lun; /* Get the interface flags */ - (void) strcpy(lifreq.lifr_name, lifr->lifr_name); + (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ); if (ioctl(sock, SIOCGLIFFLAGS, (char *)&lifreq) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFFLAGS(%s): %s\n"), - pif.pi_ifname, strerror(errno)); + if (errno != ENXIO) { + rcm_log_message(RCM_ERROR, + _("IP: SIOCGLIFFLAGS(%s): %s\n"), + lifreq.lifr_name, strerror(errno)); + } return (-1); } (void) memcpy(&ifflags, &lifreq.lifr_flags, sizeof (ifflags)); @@ -1135,12 +1056,13 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr) * - IFF_VIRTUAL: e.g., loopback and vni * - IFF_POINTOPOINT: e.g., sppp and ip.tun * - !IFF_MULTICAST: e.g., ip.6to4tun + * - IFF_IPMP: IPMP meta-interfaces * * Note: The !IFF_MULTICAST check can be removed once iptun is * implemented as a datalink. */ if (!(ifflags & IFF_MULTICAST) || - (ifflags & (IFF_POINTOPOINT | IFF_VIRTUAL))) { + (ifflags & (IFF_POINTOPOINT | IFF_VIRTUAL | IFF_IPMP))) { rcm_log_message(RCM_TRACE3, "IP: if ignored (%s)\n", pif.pi_ifname); return (0); @@ -1148,23 +1070,26 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr) /* Get the interface group name for this interface */ if (ioctl(sock, SIOCGLIFGROUPNAME, (char *)&lifreq) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFGROUPNAME(%s): %s\n"), - lifreq.lifr_name, strerror(errno)); + if (errno != ENXIO) { + rcm_log_message(RCM_ERROR, + _("IP: SIOCGLIFGROUPNAME(%s): %s\n"), + lifreq.lifr_name, strerror(errno)); + } return (-1); } /* copy the group name */ - (void) memcpy(&pif.pi_grpname, &lifreq.lifr_groupname, - sizeof (pif.pi_grpname)); - pif.pi_grpname[sizeof (pif.pi_grpname) - 1] = '\0'; + (void) strlcpy(pif.pi_grname, lifreq.lifr_groupname, + sizeof (pif.pi_grname)); /* Get the interface address for this interface */ if (ioctl(sock, SIOCGLIFADDR, (char 
*)&lifreq) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFADDR(%s): %s\n"), - lifreq.lifr_name, strerror(errno)); - return (-1); + if (errno != ENXIO) { + rcm_log_message(RCM_ERROR, + _("IP: SIOCGLIFADDR(%s): %s\n"), + lifreq.lifr_name, strerror(errno)); + return (-1); + } } (void) memcpy(&ifaddr, &lifreq.lifr_addr, sizeof (ifaddr)); @@ -1241,9 +1166,9 @@ update_pif(rcm_handle_t *hd, int af, int sock, struct lifreq *lifr) sizeof (pif.pi_ifname)); } - /* save pif properties */ - (void) memcpy(&probepif->pi_grpname, &pif.pi_grpname, - sizeof (pif.pi_grpname)); + /* save the group name */ + (void) strlcpy(probepif->pi_grname, pif.pi_grname, + sizeof (pif.pi_grname)); /* add lif, if this is a lif and it is not in cache */ if (!lif_listed) { @@ -1304,7 +1229,7 @@ update_ipifs(rcm_handle_t *hd, int af) } lifn.lifn_family = af; - lifn.lifn_flags = 0; + lifn.lifn_flags = LIFC_UNDER_IPMP; if (ioctl(sock, SIOCGLIFNUM, (char *)&lifn) < 0) { rcm_log_message(RCM_ERROR, _("IP: SIOCLGIFNUM failed: %s\n"), @@ -1321,7 +1246,7 @@ update_ipifs(rcm_handle_t *hd, int af) } lifc.lifc_family = af; - lifc.lifc_flags = 0; + lifc.lifc_flags = LIFC_UNDER_IPMP; lifc.lifc_len = sizeof (struct lifreq) * lifn.lifn_count; lifc.lifc_buf = buf; @@ -1480,39 +1405,33 @@ static void ip_log_err(ip_cache_t *node, char **errorp, char *errmsg) { char *ifname = NULL; - int len; + int size; const char *errfmt; - char *error; + char *error = NULL; if ((node != NULL) && (node->ip_pif != NULL) && (node->ip_pif->pi_ifname != NULL)) { ifname = node->ip_pif->pi_ifname; } - if (errorp != NULL) - *errorp = NULL; - if (ifname == NULL) { rcm_log_message(RCM_ERROR, _("IP: %s\n"), errmsg); errfmt = _("IP: %s"); - len = strlen(errfmt) + strlen(errmsg) + 1; - if (error = (char *)calloc(1, len)) { - (void) sprintf(error, errfmt, errmsg); - } + size = strlen(errfmt) + strlen(errmsg) + 1; + if (errorp != NULL && (error = malloc(size)) != NULL) + (void) snprintf(error, size, errfmt, errmsg); } else { 
rcm_log_message(RCM_ERROR, _("IP: %s(%s)\n"), errmsg, ifname); errfmt = _("IP: %s(%s)"); - len = strlen(errfmt) + strlen(errmsg) + strlen(ifname) + 1; - if (error = (char *)calloc(1, len)) { - (void) sprintf(error, errfmt, errmsg, ifname); - } + size = strlen(errfmt) + strlen(errmsg) + strlen(ifname) + 1; + if (errorp != NULL && (error = malloc(size)) != NULL) + (void) snprintf(error, size, errfmt, errmsg, ifname); } if (errorp != NULL) *errorp = error; } - /* * if_cfginfo() - Save off the config info for all interfaces */ @@ -1538,7 +1457,7 @@ if_cfginfo(ip_cache_t *node, uint_t force) rcm_log_message(RCM_ERROR, _("IP: get modlist error (%s) %s\n"), pif->pi_ifname, strerror(errno)); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } @@ -1551,7 +1470,7 @@ if_cfginfo(ip_cache_t *node, uint_t force) rcm_log_message(RCM_ERROR, _("IP: module %s@%d\n"), lif->li_modules[i], i); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } } @@ -1595,11 +1514,11 @@ if_cfginfo(ip_cache_t *node, uint_t force) /* Save reconfiguration information */ if (lif->li_ifflags & IFF_IPV4) { (void) snprintf(syscmd, sizeof (syscmd), - "%s %s:%d configinfo\n", USR_SBIN_IFCONFIG, + "%s %s:%d configinfo\n", SBIN_IFCONFIG, pif->pi_ifname, lif->li_ifnum); } else if (lif->li_ifflags & IFF_IPV6) { (void) snprintf(syscmd, sizeof (syscmd), - "%s %s:%d inet6 configinfo\n", USR_SBIN_IFCONFIG, + "%s %s:%d inet6 configinfo\n", SBIN_IFCONFIG, pif->pi_ifname, lif->li_ifnum); } rcm_log_message(RCM_TRACE2, "IP: %s\n", syscmd); @@ -1609,7 +1528,7 @@ if_cfginfo(ip_cache_t *node, uint_t force) rcm_log_message(RCM_ERROR, _("IP: ifconfig configinfo error (%s:%d) %s\n"), pif->pi_ifname, lif->li_ifnum, strerror(errno)); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } bzero(buf, MAX_RECONFIG_SIZE); @@ -1619,20 +1538,18 @@ if_cfginfo(ip_cache_t *node, uint_t force) _("IP: ifconfig configinfo error (%s:%d) %s\n"), pif->pi_ifname, lif->li_ifnum, strerror(errno)); (void) 
pclose(fp); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } (void) pclose(fp); - lif->li_reconfig = malloc(strlen(buf)+1); - if (lif->li_reconfig == NULL) { + if ((lif->li_reconfig = strdup(buf)) == NULL) { rcm_log_message(RCM_ERROR, _("IP: malloc error (%s) %s\n"), pif->pi_ifname, strerror(errno)); - (void) clr_cfg_state(pif); + clr_cfg_state(pif); return (-1); } - (void) strcpy(lif->li_reconfig, buf); rcm_log_message(RCM_DEBUG, "IP: if_cfginfo: reconfig string(%s:%d) = %s\n", pif->pi_ifname, lif->li_ifnum, lif->li_reconfig); @@ -1654,57 +1571,37 @@ static int if_unplumb(ip_cache_t *node) { ip_lif_t *lif; - ip_pif_t *pif; - int ipv4 = 0, ipv6 = 0; - char syscmd[MAX_RECONFIG_SIZE + LIFNAMSIZ]; + ip_pif_t *pif = node->ip_pif; + boolean_t ipv4 = B_FALSE; + boolean_t ipv6 = B_FALSE; rcm_log_message(RCM_TRACE2, "IP: if_unplumb(%s)\n", node->ip_resource); - pif = node->ip_pif; - lif = pif->pi_lifs; - - while (lif != NULL) { + for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) { if (lif->li_ifflags & IFF_IPV4) { - ipv4++; + ipv4 = B_TRUE; } else if (lif->li_ifflags & IFF_IPV6) { - ipv6++; + ipv6 = B_TRUE; } else { /* Unlikely case */ rcm_log_message(RCM_DEBUG, "IP: Unplumb ignored (%s:%d)\n", pif->pi_ifname, lif->li_ifnum); - lif = lif->li_next; - continue; } - lif = lif->li_next; } - /* Unplumb the physical interface */ - if (ipv4) { - rcm_log_message(RCM_TRACE2, - "IP: if_unplumb: ifconfig %s unplumb\n", pif->pi_ifname); - (void) snprintf(syscmd, sizeof (syscmd), "%s %s unplumb\n", - USR_SBIN_IFCONFIG, pif->pi_ifname); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot unplumb (%s) %s\n"), - pif->pi_ifname, strerror(errno)); - return (-1); - } + if (ipv4 && !ifconfig(pif->pi_ifname, "inet", "unplumb", B_FALSE)) { + rcm_log_message(RCM_ERROR, _("IP: Cannot unplumb (%s) %s\n"), + pif->pi_ifname, strerror(errno)); + return (-1); } - if (ipv6) { - rcm_log_message(RCM_TRACE2, - "IP: if_unplumb: ifconfig %s inet6 
unplumb\n", - pif->pi_ifname); - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s inet6 unplumb\n", USR_SBIN_IFCONFIG, pif->pi_ifname); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot unplumb (%s) %s\n"), - pif->pi_ifname, strerror(errno)); - return (-1); - } + + if (ipv6 && !ifconfig(pif->pi_ifname, "inet6", "unplumb", B_FALSE)) { + rcm_log_message(RCM_ERROR, _("IP: Cannot unplumb (%s) %s\n"), + pif->pi_ifname, strerror(errno)); + return (-1); } + rcm_log_message(RCM_TRACE2, "IP: if_unplumb(%s) success\n", node->ip_resource); @@ -1723,8 +1620,11 @@ if_replumb(ip_cache_t *node) ip_lif_t *lif; ip_pif_t *pif; int i; - char syscmd[LIFNAMSIZ+MAXPATHLEN]; /* must be big enough */ - int max_ipv4 = 0, max_ipv6 = 0; + boolean_t success, ipmp; + const char *fstr; + char lifname[LIFNAMSIZ]; + char buf[MAX_RECONFIG_SIZE]; + int max_lifnum = 0; rcm_log_message(RCM_TRACE2, "IP: if_replumb(%s)\n", node->ip_resource); @@ -1738,100 +1638,103 @@ if_replumb(ip_cache_t *node) */ pif = node->ip_pif; - lif = pif->pi_lifs; + ipmp = (node->ip_pif->pi_grname[0] != '\0'); /* * Make a first pass to plumb in physical interfaces and get a count * of the max logical interfaces */ - while (lif != NULL) { + for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) { + max_lifnum = MAX(lif->li_ifnum, max_lifnum); if (lif->li_ifflags & IFF_IPV4) { - if (lif->li_ifnum > max_ipv4) { - max_ipv4 = lif->li_ifnum; - } + fstr = "inet"; } else if (lif->li_ifflags & IFF_IPV6) { - if (lif->li_ifnum > max_ipv6) { - max_ipv6 = lif->li_ifnum; - } + fstr = "inet6"; } else { /* Unlikely case */ rcm_log_message(RCM_DEBUG, "IP: Re-plumb ignored (%s:%d)\n", pif->pi_ifname, lif->li_ifnum); - lif = lif->li_next; continue; } - if (lif->li_ifnum == 0) { /* physical interface instance */ - if ((lif->li_ifflags & IFF_NOFAILOVER) || - (strcmp(pif->pi_grpname, "") == 0)) { - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s\n", USR_SBIN_IFCONFIG, - lif->li_reconfig); - } else if 
(lif->li_ifflags & IFF_IPV4) { - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s inet plumb group %s\n", - USR_SBIN_IFCONFIG, - pif->pi_ifname, pif->pi_grpname); - } else if (lif->li_ifflags & IFF_IPV6) { - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s inet6 plumb group %s\n", - USR_SBIN_IFCONFIG, - pif->pi_ifname, pif->pi_grpname); - } + /* ignore logical interface instances */ + if (lif->li_ifnum != 0) + continue; + + if ((lif->li_ifflags & IFF_NOFAILOVER) || !ipmp) { + success = ifconfig("", "", lif->li_reconfig, B_FALSE); + } else { + (void) snprintf(buf, sizeof (buf), "plumb group %s", + pif->pi_grname); + success = ifconfig(pif->pi_ifname, fstr, buf, B_FALSE); + } + + if (!success) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot plumb (%s) %s\n"), pif->pi_ifname, + strerror(errno)); + return (-1); + } + + /* + * Restart DHCP if necessary. + */ + if ((lif->li_ifflags & IFF_DHCPRUNNING) && + !ifconfig(pif->pi_ifname, fstr, CFG_DHCP_CMD, B_FALSE)) { + rcm_log_message(RCM_ERROR, _("IP: Cannot start DHCP " + "(%s) %s\n"), pif->pi_ifname, strerror(errno)); + return (-1); + } + rcm_log_message(RCM_TRACE2, + "IP: if_replumb: Modcnt = %d\n", lif->li_modcnt); + /* modinsert modules in order, ignore driver(last) */ + for (i = 0; i < (lif->li_modcnt - 1); i++) { rcm_log_message(RCM_TRACE2, - "IP: if_replumb: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { + "IP: modinsert: Pos = %d Mod = %s\n", + i, lif->li_modules[i]); + if (modop(pif->pi_ifname, lif->li_modules[i], i, + MOD_INSERT) == -1) { rcm_log_message(RCM_ERROR, - _("IP: Cannot plumb (%s) %s\n"), - pif->pi_ifname, strerror(errno)); + _("IP: modinsert error(%s)\n"), + pif->pi_ifname); return (-1); } - - rcm_log_message(RCM_TRACE2, - "IP: if_replumb: Modcnt = %d\n", lif->li_modcnt); - /* modinsert modules in order, ignore driver(last) */ - for (i = 0; i < (lif->li_modcnt - 1); i++) { - rcm_log_message(RCM_TRACE2, - "IP: modinsert: Pos = %d Mod = %s\n", - i, lif->li_modules[i]); - if (modop(pif->pi_ifname, 
lif->li_modules[i], i, - MOD_INSERT) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: modinsert error(%s)\n"), - pif->pi_ifname); - return (-1); - } - } } - - lif = lif->li_next; } /* Now, add all the logical interfaces in the correct order */ - for (i = 1; i <= MAX(max_ipv6, max_ipv4); i++) { + for (i = 1; i <= max_lifnum; i++) { + (void) snprintf(lifname, LIFNAMSIZ, "%s:%d", pif->pi_ifname, i); + /* reset lif through every iteration */ - lif = pif->pi_lifs; - while (lif != NULL) { - if (((lif->li_ifflags & IFF_NOFAILOVER) || - (strcmp(pif->pi_grpname, "") == 0)) && - (lif->li_ifnum == i)) { - /* Plumb in the logical interface */ - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s\n", USR_SBIN_IFCONFIG, - lif->li_reconfig); - rcm_log_message(RCM_TRACE2, - "IP: if_replumb: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot addif (%s:%d) " - "%s\n"), - pif->pi_ifname, i, strerror(errno)); - return (-1); - } + for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) { + /* + * Process entries in order. If the interface is + * using IPMP, only process test addresses. + */ + if (lif->li_ifnum != i || + (ipmp && !(lif->li_ifflags & IFF_NOFAILOVER))) + continue; + + if (!ifconfig("", "", lif->li_reconfig, B_FALSE)) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot addif (%s) %s\n"), lifname, + strerror(errno)); + return (-1); + } + + /* + * Restart DHCP if necessary. + */ + if ((lif->li_ifflags & IFF_DHCPRUNNING) && + !ifconfig(lifname, fstr, CFG_DHCP_CMD, B_FALSE)) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot start DHCP (%s) %s\n"), + lifname, strerror(errno)); + return (-1); } - lif = lif->li_next; } } @@ -1865,71 +1768,64 @@ clr_cfg_state(ip_pif_t *pif) } /* - * ip_ipmp_offline() - Failover from if_from to if_to using a - * minimum redudancy of min_red. This uses IPMPs - * "offline" mechanism to achieve the failover. + * Attempt to offline ip_cache_t `node'; returns an IPMP error code. 
*/ static int -ip_ipmp_offline(ip_cache_t *if_from, ip_cache_t *if_to) +ip_ipmp_offline(ip_cache_t *node) { - mpathd_cmd_t mpdcmd; - - if ((if_from == NULL) || (if_from->ip_pif == NULL) || - (if_from->ip_pif->pi_ifname == NULL)) { - return (-1); - } + int retval; + ipmp_handle_t handle; rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_offline\n"); - mpdcmd.cmd_command = MI_OFFLINE; - (void) strcpy(mpdcmd.cmd_ifname, if_from->ip_pif->pi_ifname); - - if ((if_to != NULL) && (if_to->ip_pif != NULL) && - (if_to->ip_pif->pi_ifname != NULL)) { - rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_offline (%s)->(%s)\n", - if_from->ip_pif->pi_ifname, if_to->ip_pif->pi_ifname); - (void) strncpy(mpdcmd.cmd_movetoif, if_to->ip_pif->pi_ifname, - sizeof (mpdcmd.cmd_movetoif)); - mpdcmd.cmd_movetoif[sizeof (mpdcmd.cmd_movetoif) - 1] = '\0'; - } else { - rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_offline (%s)->(any)\n", - if_from->ip_pif->pi_ifname); - (void) strcpy(mpdcmd.cmd_movetoif, ""); /* signifies any */ + if ((retval = ipmp_open(&handle)) != IPMP_SUCCESS) { + rcm_log_message(RCM_ERROR, + _("IP: cannot create ipmp handle: %s\n"), + ipmp_errmsg(retval)); + return (retval); } - mpdcmd.cmd_min_red = if_from->ip_ifred; - if (mpathd_send_cmd(&mpdcmd) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd offline error: %s\n"), - strerror(errno)); - return (-1); + retval = ipmp_offline(handle, node->ip_pif->pi_ifname, node->ip_ifred); + if (retval != IPMP_SUCCESS) { + rcm_log_message(RCM_ERROR, _("IP: ipmp_offline error: %s\n"), + ipmp_errmsg(retval)); + } else { + rcm_log_message(RCM_TRACE1, "IP: ipmp_offline success\n"); } - rcm_log_message(RCM_TRACE1, "IP: ipmp offline success\n"); - return (0); + ipmp_close(handle); + return (retval); } /* - * ip_ipmp_undo_offline() - Undo prior offline of the interface. - * This uses IPMPs "undo offline" feature. + * Attempt to undo the offline ip_cache_t `node'; returns an IPMP error code. 
*/ static int ip_ipmp_undo_offline(ip_cache_t *node) { - mpathd_cmd_t mpdcmd; + int retval; + ipmp_handle_t handle; - mpdcmd.cmd_command = MI_UNDO_OFFLINE; - (void) strcpy(mpdcmd.cmd_ifname, node->ip_pif->pi_ifname); + rcm_log_message(RCM_TRACE1, "IP: ip_ipmp_undo_offline\n"); - if (mpathd_send_cmd(&mpdcmd) < 0) { + if ((retval = ipmp_open(&handle)) != IPMP_SUCCESS) { rcm_log_message(RCM_ERROR, - _("IP: mpathd error: %s\n"), - strerror(errno)); - return (-1); + _("IP: cannot create ipmp handle: %s\n"), + ipmp_errmsg(retval)); + return (retval); } - rcm_log_message(RCM_TRACE1, "IP: ipmp undo offline success\n"); - return (0); + retval = ipmp_undo_offline(handle, node->ip_pif->pi_ifname); + if (retval != IPMP_SUCCESS) { + rcm_log_message(RCM_ERROR, + _("IP: ipmp_undo_offline error: %s\n"), + ipmp_errmsg(retval)); + } else { + rcm_log_message(RCM_TRACE1, "IP: ipmp_undo_offline success\n"); + } + + ipmp_close(handle); + return (retval); } /* @@ -1946,10 +1842,9 @@ get_link_resource(const char *link) char *resource; dladm_status_t status; - if ((status = dladm_name2info(dld_handle, link, &linkid, &flags, NULL, - NULL)) != DLADM_STATUS_OK) { + status = dladm_name2info(dld_handle, link, &linkid, &flags, NULL, NULL); + if (status != DLADM_STATUS_OK) goto fail; - } if (!(flags & DLADM_OPT_ACTIVE)) { status = DLADM_STATUS_FAILED; @@ -1976,243 +1871,6 @@ fail: } /* - * if_get_flags() - Return the cached physical interface flags - * Call with cache_lock held - */ -static uint64_t -if_get_flags(ip_pif_t *pif) -{ - ip_lif_t *lif; - - for (lif = pif->pi_lifs; lif != NULL; lif = lif->li_next) { - if (lif->li_ifnum == 0) { - return (lif->li_ifflags & RCM_PIF_FLAGS); - } - } - return (0); -} - -/* - * mpathd_send_cmd() - Sends the command to in.mpathd. 
- */ -static int -mpathd_send_cmd(mpathd_cmd_t *mpd) -{ - mpathd_unoffline_t mpc; - struct mpathd_response mpr; - int i; - int s; - - rcm_log_message(RCM_TRACE1, "IP: mpathd_send_cmd \n"); - - for (i = 0; i < MPATHD_MAX_RETRIES; i++) { - s = connect_to_mpathd(AF_INET); - if (s == -1) { - s = connect_to_mpathd(AF_INET6); - if (s == -1) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot talk to mpathd\n")); - return (-1); - } - } - switch (mpd->cmd_command) { - case MI_OFFLINE : - rcm_log_message(RCM_TRACE1, "IP: MI_OFFLINE: " - "(%s)->(%s) redundancy = %d\n", mpd->cmd_ifname, - mpd->cmd_movetoif, mpd->cmd_min_red); - - if (write(s, mpd, sizeof (mpathd_cmd_t)) != - sizeof (mpathd_cmd_t)) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd write: %s\n"), - strerror(errno)); - (void) close(s); - return (-1); - } - break; - - case MI_SETOINDEX : - rcm_log_message(RCM_TRACE1, "IP: MI_SETOINDEX: " - "(%s)->(%s) family = %d\n", mpd->from_lifname, - mpd->to_pifname, mpd->addr_family); - - if (write(s, mpd, sizeof (mpathd_cmd_t)) != - sizeof (mpathd_cmd_t)) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd write: %s\n"), - strerror(errno)); - (void) close(s); - return (-1); - } - break; - - case MI_UNDO_OFFLINE: - /* mpathd checks for exact size of the message */ - mpc.cmd_command = mpd->cmd_command; - (void) strcpy(mpc.cmd_ifname, mpd->cmd_ifname); - - rcm_log_message(RCM_TRACE1, "IP: MI_UNDO_OFFLINE: " - "(%s)\n", mpd->cmd_ifname); - - if (write(s, &mpc, sizeof (mpathd_unoffline_t)) != - sizeof (mpathd_unoffline_t)) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd write: %s\n"), - strerror(errno)); - (void) close(s); - return (-1); - } - break; - default : - rcm_log_message(RCM_ERROR, - _("IP: unsupported mpathd command\n")); - (void) close(s); - return (-1); - } - - bzero(&mpr, sizeof (struct mpathd_response)); - /* Read the result from mpathd */ - if (read(s, &mpr, sizeof (struct mpathd_response)) != - sizeof (struct mpathd_response)) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd 
read : %s\n"), strerror(errno)); - (void) close(s); - return (-1); - } - - (void) close(s); - if (mpr.resp_mpathd_err == 0) { - rcm_log_message(RCM_TRACE1, - "IP: mpathd_send_cmd success\n"); - return (0); /* Successful */ - } - - if (mpr.resp_mpathd_err == MPATHD_SYS_ERROR) { - if (mpr.resp_sys_errno == EAGAIN) { - (void) sleep(1); - rcm_log_message(RCM_DEBUG, - "IP: mpathd retrying\n"); - continue; /* Retry */ - } - errno = mpr.resp_sys_errno; - rcm_log_message(RCM_WARNING, - _("IP: mpathd_send_cmd error: %s\n"), - strerror(errno)); - } else if (mpr.resp_mpathd_err == MPATHD_MIN_RED_ERROR) { - errno = EIO; - rcm_log_message(RCM_ERROR, _("IP: in.mpathd(1M): " - "Minimum redundancy not met\n")); - } else { - rcm_log_message(RCM_ERROR, - _("IP: mpathd_send_cmd error\n")); - } - /* retry */ - } - - rcm_log_message(RCM_ERROR, - _("IP: mpathd_send_cmd failed %d retries\n"), MPATHD_MAX_RETRIES); - return (-1); -} - -/* - * Returns -1 on failure. Returns the socket file descriptor on - * success. - */ -static int -connect_to_mpathd(int family) -{ - int s; - struct sockaddr_storage ss; - struct sockaddr_in *sin = (struct sockaddr_in *)&ss; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss; - struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; - int addrlen; - int ret; - int on; - - rcm_log_message(RCM_TRACE1, "IP: connect_to_mpathd\n"); - - s = socket(family, SOCK_STREAM, 0); - if (s < 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd socket: %s\n"), strerror(errno)); - return (-1); - } - bzero((char *)&ss, sizeof (ss)); - ss.ss_family = family; - /* - * Need to bind to a privelged port. For non-root, this - * will fail. in.mpathd verifies that only commands coming - * from priveleged ports succeed so that the ordinary user - * can't issue offline commands. 
- */ - on = 1; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd setsockopt: TCP_ANONPRIVBIND: %s\n"), - strerror(errno)); - return (-1); - } - switch (family) { - case AF_INET: - sin->sin_port = 0; - sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - addrlen = sizeof (struct sockaddr_in); - break; - case AF_INET6: - sin6->sin6_port = 0; - sin6->sin6_addr = loopback_addr; - addrlen = sizeof (struct sockaddr_in6); - break; - } - ret = bind(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd bind: %s\n"), strerror(errno)); - return (-1); - } - switch (family) { - case AF_INET: - sin->sin_port = htons(MPATHD_PORT); - break; - case AF_INET6: - sin6->sin6_port = htons(MPATHD_PORT); - break; - } - ret = connect(s, (struct sockaddr *)&ss, addrlen); - if (ret != 0) { - if (errno == ECONNREFUSED) { - /* in.mpathd is not running, start it */ - if (rcm_exec_cmd(MPATHD_PATH) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd exec: %s\n"), - strerror(errno)); - return (-1); - } - ret = connect(s, (struct sockaddr *)&ss, addrlen); - } - if (ret != 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd connect: %s\n"), strerror(errno)); - return (-1); - } - } - on = 0; - if (setsockopt(s, IPPROTO_TCP, TCP_ANONPRIVBIND, &on, - sizeof (on)) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: mpathd setsockopt TCP_ANONPRIVBIND: %s\n"), - strerror(errno)); - return (-1); - } - - rcm_log_message(RCM_TRACE1, "IP: connect_to_mpathd success\n"); - - return (s); -} - -/* * modop() - Remove/insert a module */ static int @@ -2239,12 +1897,10 @@ modop(char *name, char *arg, int pos, char op) if (op == MOD_REMOVE) { (void) snprintf(syscmd, sizeof (syscmd), - "%s %s modremove %s@%d\n", USR_SBIN_IFCONFIG, name, arg, - pos); + "%s %s modremove %s@%d\n", SBIN_IFCONFIG, name, arg, pos); } else if (op == MOD_INSERT) { (void) snprintf(syscmd, sizeof (syscmd), - "%s %s modinsert %s@%d\n", 
USR_SBIN_IFCONFIG, name, arg, - pos); + "%s %s modinsert %s@%d\n", SBIN_IFCONFIG, name, arg, pos); } else { rcm_log_message(RCM_ERROR, _("IP: modop(%s): unknown operation\n"), name); @@ -2277,11 +1933,11 @@ get_modlist(char *name, ip_lif_t *lif) int i; int num_mods; struct lifreq lifr; - struct str_list strlist; + struct str_list strlist = { 0 }; rcm_log_message(RCM_TRACE1, "IP: getmodlist(%s)\n", name); - (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); + (void) strlcpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); lifr.lifr_flags = lif->li_ifflags; if (ip_domux2fd(&mux_fd, &muxid_fd, &fd, &lifr) < 0) { rcm_log_message(RCM_ERROR, _("IP: ip_domux2fd(%s)\n"), name); @@ -2292,39 +1948,34 @@ get_modlist(char *name, ip_lif_t *lif) rcm_log_message(RCM_ERROR, _("IP: get_modlist(%s): I_LIST(%s) \n"), name, strerror(errno)); - (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); - return (-1); + goto fail; } strlist.sl_nmods = num_mods; strlist.sl_modlist = malloc(sizeof (struct str_mlist) * num_mods); - if (strlist.sl_modlist == NULL) { rcm_log_message(RCM_ERROR, _("IP: get_modlist(%s): %s\n"), name, strerror(errno)); - (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); - return (-1); + goto fail; } if (ioctl(fd, I_LIST, (caddr_t)&strlist) < 0) { rcm_log_message(RCM_ERROR, _("IP: get_modlist(%s): I_LIST error: %s\n"), name, strerror(errno)); - (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); - return (-1); + goto fail; } for (i = 0; i < strlist.sl_nmods; i++) { - lif->li_modules[i] = - malloc(strlen(strlist.sl_modlist[i].l_name)+1); + lif->li_modules[i] = strdup(strlist.sl_modlist[i].l_name); if (lif->li_modules[i] == NULL) { rcm_log_message(RCM_ERROR, _("IP: get_modlist(%s): %s\n"), name, strerror(errno)); - (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); - return (-1); + while (i > 0) + free(lif->li_modules[--i]); + goto fail; } - (void) strcpy(lif->li_modules[i], strlist.sl_modlist[i].l_name); } lif->li_modcnt = strlist.sl_nmods; @@ -2332,6 +1983,10 @@ get_modlist(char 
*name, ip_lif_t *lif) rcm_log_message(RCM_TRACE1, "IP: getmodlist(%s) success\n", name); return (ip_plink(mux_fd, muxid_fd, fd, &lifr)); +fail: + free(strlist.sl_modlist); + (void) ip_plink(mux_fd, muxid_fd, fd, &lifr); + return (-1); } /* @@ -2436,6 +2091,7 @@ ip_plink(int mux_fd, int muxid_fd, int fd, struct lifreq *lifr) * * Notify online to IP address consumers. */ +/*ARGSUSED*/ static int ip_onlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags, rcm_info_t **depend_info) @@ -2464,6 +2120,7 @@ ip_onlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags, * * Offline IP address consumers. */ +/*ARGSUSED*/ static int ip_offlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags, rcm_info_t **depend_info) @@ -2494,9 +2151,9 @@ ip_offlinelist(rcm_handle_t *hd, ip_cache_t *node, char **errorp, uint_t flags, } /* - * ip_get_addrlist() - Compile list of IP addresses hosted on this NIC (node) - * This routine malloc() required memeory for the list - * Returns list on success, NULL if failed + * ip_get_addrlist() - Get the list of IP addresses on this interface (node); + * This routine malloc()s required memory for the list. + * Returns the list on success, NULL on failure. * Call with cache_lock held. 
*/ static char ** @@ -2504,11 +2161,9 @@ ip_get_addrlist(ip_cache_t *node) { ip_lif_t *lif; char **addrlist = NULL; - int numifs; + int i, numifs; + size_t addrlistsize; char addrstr[INET6_ADDRSTRLEN]; - void *addr; - int af; - int i; rcm_log_message(RCM_TRACE2, "IP: ip_get_addrlist(%s)\n", node->ip_resource); @@ -2532,35 +2187,21 @@ ip_get_addrlist(ip_cache_t *node) for (lif = node->ip_pif->pi_lifs, i = 0; lif != NULL; lif = lif->li_next, i++) { - af = lif->li_addr.family; - if (af == AF_INET6) { - addr = &lif->li_addr.ip6.sin6_addr; - } else if (af == AF_INET) { - addr = &lif->li_addr.ip4.sin_addr; - } else { - rcm_log_message(RCM_DEBUG, - "IP: unknown addr family %d, assuming AF_INET\n", - af); - af = AF_INET; - addr = &lif->li_addr.ip4.sin_addr; - } - if (inet_ntop(af, addr, addrstr, INET6_ADDRSTRLEN) == NULL) { - rcm_log_message(RCM_ERROR, - _("IP: inet_ntop: %s\n"), strerror(errno)); + if (!ip_addrstr(lif, addrstr, sizeof (addrstr))) { ip_free_addrlist(addrlist); return (NULL); } - if ((addrlist[i] = malloc(strlen(addrstr) + RCM_SIZE_SUNW_IP)) - == NULL) { + addrlistsize = strlen(addrstr) + sizeof (RCM_STR_SUNW_IP); + if ((addrlist[i] = malloc(addrlistsize)) == NULL) { rcm_log_message(RCM_ERROR, _("IP: ip_get_addrlist(%s) malloc failure(%s)\n"), node->ip_resource, strerror(errno)); ip_free_addrlist(addrlist); return (NULL); } - (void) strcpy(addrlist[i], RCM_STR_SUNW_IP); /* SUNW_ip/ */ - (void) strcat(addrlist[i], addrstr); /* SUNW_ip/<address> */ + (void) snprintf(addrlist[i], addrlistsize, "%s%s", + RCM_STR_SUNW_IP, addrstr); rcm_log_message(RCM_DEBUG, "Anon Address: %s\n", addrlist[i]); } @@ -2611,16 +2252,13 @@ ip_consumer_notify(rcm_handle_t *hd, datalink_id_t linkid, char **errorp, return; } /* - * Inform anonymous consumers about IP addresses being - * onlined + * Inform anonymous consumers about IP addresses being onlined. 
*/ (void) ip_onlinelist(hd, node, errorp, flags, depend_info); (void) mutex_unlock(&cache_lock); rcm_log_message(RCM_TRACE2, "IP: ip_consumer_notify success\n"); - return; - } /* @@ -2632,20 +2270,18 @@ if_configure(datalink_id_t linkid) char ifinst[MAXLINKNAMELEN]; char cfgfile[MAXPATHLEN]; char cached_name[RCM_LINK_RESOURCE_MAX]; - struct stat statbuf; + FILE *hostfp, *host6fp; ip_cache_t *node; - int af = 0; - int ipmp = 0; + boolean_t ipmp = B_FALSE; assert(linkid != DATALINK_INVALID_LINKID); - rcm_log_message(RCM_TRACE1, _("IP: if_configure(%u)\n"), linkid); /* Check for the interface in the cache */ (void) snprintf(cached_name, sizeof (cached_name), "%s/%u", RCM_LINK_PREFIX, linkid); - /* Check if the interface is new or was previously offlined */ + /* Check if the interface is new or was not previously offlined */ (void) mutex_lock(&cache_lock); if (((node = cache_lookup(NULL, cached_name, CACHE_REFRESH)) != NULL) && (!(node->ip_cachestate & CACHE_IF_OFFLINED))) { @@ -2663,76 +2299,69 @@ if_configure(datalink_id_t linkid) return (-1); } - /* Scan IPv4 configuration first */ - (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV4, ifinst); - cfgfile[MAXPATHLEN - 1] = '\0'; - + /* + * Scan the IPv4 and IPv6 hostname files to see if (a) they exist + * and (b) if either one places the interface into an IPMP group. 
+ */ + (void) snprintf(cfgfile, MAXPATHLEN, CFGFILE_FMT_IPV4, ifinst); rcm_log_message(RCM_TRACE1, "IP: Scanning %s\n", cfgfile); - if (stat(cfgfile, &statbuf) == 0) { - af |= CONFIG_AF_INET; - if (isgrouped(cfgfile)) { - ipmp++; - } + if ((hostfp = fopen(cfgfile, "r")) != NULL) { + if (isgrouped(cfgfile)) + ipmp = B_TRUE; } - /* Scan IPv6 configuration details */ - (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV6, ifinst); - cfgfile[MAXPATHLEN - 1] = '\0'; + (void) snprintf(cfgfile, MAXPATHLEN, CFGFILE_FMT_IPV6, ifinst); rcm_log_message(RCM_TRACE1, "IP: Scanning %s\n", cfgfile); - if (stat(cfgfile, &statbuf) == 0) { - af |= CONFIG_AF_INET6; - if ((ipmp == 0) && isgrouped(cfgfile)) { - ipmp++; - } + if ((host6fp = fopen(cfgfile, "r")) != NULL) { + if (!ipmp && isgrouped(cfgfile)) + ipmp = B_TRUE; } - if (af & CONFIG_AF_INET) { - if (if_ipmp_config(ifinst, CONFIG_AF_INET, ipmp) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: IPv4 Post-attach failed (%s)\n"), ifinst); - return (-1); - } + /* + * Configure the interface according to its hostname files. 
+ */ + if (hostfp != NULL && + if_config_inst(ifinst, hostfp, AF_INET, ipmp) == -1) { + rcm_log_message(RCM_ERROR, + _("IP: IPv4 Post-attach failed (%s)\n"), ifinst); + goto fail; } - if (af & CONFIG_AF_INET6) { - if (if_ipmp_config(ifinst, CONFIG_AF_INET6, ipmp) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: IPv6 Post-attach failed(%s)\n"), ifinst); - return (-1); - } + if (host6fp != NULL && + if_config_inst(ifinst, host6fp, AF_INET6, ipmp) == -1) { + rcm_log_message(RCM_ERROR, + _("IP: IPv6 Post-attach failed (%s)\n"), ifinst); + goto fail; } + (void) fclose(hostfp); + (void) fclose(host6fp); rcm_log_message(RCM_TRACE1, "IP: if_configure(%s) success\n", ifinst); - return (0); - +fail: + (void) fclose(hostfp); + (void) fclose(host6fp); + return (-1); } /* - * isgrouped() - Scans the given config file to see if this is a grouped - * interface - * Returns non-zero if true; 0 if false + * isgrouped() - Scans the given config file to see if this interface is + * using IPMP. Returns B_TRUE or B_FALSE. 
*/ -static int -isgrouped(char *cfgfile) +static boolean_t +isgrouped(const char *cfgfile) { FILE *fp; struct stat statb; - char *buf = NULL; - char *tokens[MAXARGS]; /* token pointers */ - char tspace[MAXLINE]; /* token space */ - int ntok; - int group = 0; - - if (cfgfile == NULL) - return (0); + char *nlp, *line, *token, *lasts, *buf; + boolean_t grouped = B_FALSE; rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s)\n", cfgfile); if (stat(cfgfile, &statb) != 0) { rcm_log_message(RCM_TRACE1, _("IP: No config file(%s)\n"), cfgfile); - return (0); + return (B_FALSE); } /* @@ -2744,609 +2373,284 @@ isgrouped(char *cfgfile) if (statb.st_size <= 1) { rcm_log_message(RCM_TRACE1, _("IP: Empty config file(%s)\n"), cfgfile); - return (0); + return (B_FALSE); } if ((fp = fopen(cfgfile, "r")) == NULL) { rcm_log_message(RCM_ERROR, _("IP: Cannot open configuration file(%s): %s\n"), cfgfile, strerror(errno)); - return (0); + return (B_FALSE); } - if ((buf = calloc(1, statb.st_size)) == NULL) { + if ((buf = malloc(statb.st_size)) == NULL) { rcm_log_message(RCM_ERROR, - _("IP: calloc failure(%s): %s\n"), cfgfile, + _("IP: malloc failure(%s): %s\n"), cfgfile, strerror(errno)); - (void) fclose(fp); - return (0); + goto out; } while (fgets(buf, statb.st_size, fp) != NULL) { - if (*buf == '\0') - continue; - - tokenize(buf, tokens, tspace, &ntok); - while (ntok) { - if (STREQ("group", tokens[ntok - 1])) { - if (tokens[ntok] != NULL) { - group++; - } + if ((nlp = strrchr(buf, '\n')) != NULL) + *nlp = '\0'; + + line = buf; + while ((token = strtok_r(line, " \t", &lasts)) != NULL) { + line = NULL; + if (STREQ("group", token) && + strtok_r(NULL, " \t", &lasts) != NULL) { + grouped = B_TRUE; + goto out; } - ntok--; } } - +out: free(buf); - (void) fclose(fp); - if (group <= 0) { - rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s) non-grouped\n", - cfgfile); - return (0); - } else { - rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s) grouped\n", - cfgfile); - return (1); - } -} + 
rcm_log_message(RCM_TRACE1, "IP: isgrouped(%s): %d\n", cfgfile, + grouped); + return (grouped); +} /* - * if_ipmp_config() - Configure an interface instance as specified by the + * if_config_inst() - Configure an interface instance as specified by the * address family af and if it is grouped (ipmp). */ static int -if_ipmp_config(char *ifinst, int af, int ipmp) +if_config_inst(const char *ifinst, FILE *hfp, int af, boolean_t ipmp) { - char cfgfile[MAXPATHLEN]; /* configuration file */ - FILE *fp; + FILE *ifparsefp; struct stat statb; - char *buf; - char *tokens[MAXARGS]; /* list of config attributes */ - char tspace[MAXLINE]; /* token space */ - char syscmd[MAX_RECONFIG_SIZE + MAXPATHLEN + 1]; - char grpcmd[MAX_RECONFIG_SIZE + MAXPATHLEN + 1]; - char fstr[8]; /* address family string inet or inet6 */ - int nofailover = 0; - int newattach = 0; - int cmdvalid = 0; - int ntok; - int n; - int stdif = 0; - - if (ifinst == NULL) - return (0); + char *buf = NULL; + char *ifparsebuf = NULL; + uint_t ifparsebufsize; + const char *fstr; /* address family string */ + boolean_t stdif = B_FALSE; - rcm_log_message(RCM_TRACE1, "IP: if_ipmp_config(%s) ipmp = %d\n", + rcm_log_message(RCM_TRACE1, "IP: if_config_inst(%s) ipmp = %d\n", ifinst, ipmp); - if (af & CONFIG_AF_INET) { - (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV4, - ifinst); - (void) strcpy(fstr, "inet"); - } else if (af & CONFIG_AF_INET6) { - (void) snprintf(cfgfile, MAXPATHLEN, "%s%s", CFGFILE_FMT_IPV6, - ifinst); - (void) strcpy(fstr, "inet6"); - } else { - return (0); /* nothing to do */ - } - - cfgfile[MAXPATHLEN - 1] = '\0'; - grpcmd[0] = '\0'; - - if (stat(cfgfile, &statb) != 0) { - rcm_log_message(RCM_TRACE1, - "IP: No config file(%s)\n", ifinst); - return (0); + if (fstat(fileno(hfp), &statb) != 0) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot fstat file(%s)\n"), ifinst); + goto fail; } - /* Config file exists, plumb in the physical interface */ - if (af & CONFIG_AF_INET6) { - if 
(if_getcount(AF_INET6) == 0) { - /* - * Configure software loopback driver if this is the - * first IPv6 interface plumbed - */ - newattach++; - (void) snprintf(syscmd, sizeof (syscmd), - "%s lo0 %s plumb ::1 up", USR_SBIN_IFCONFIG, fstr); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot plumb (%s) %s\n"), - ifinst, strerror(errno)); - return (-1); - } - } - (void) snprintf(syscmd, sizeof (syscmd), "%s %s %s plumb up", - USR_SBIN_IFCONFIG, ifinst, fstr); - } else { - (void) snprintf(syscmd, sizeof (syscmd), "%s %s %s plumb ", - USR_SBIN_IFCONFIG, ifinst, fstr); - if (if_getcount(AF_INET) == 0) { - newattach++; - } + switch (af) { + case AF_INET: + fstr = "inet"; + break; + case AF_INET6: + fstr = "inet6"; + break; + default: + assert(0); } - rcm_log_message(RCM_TRACE1, "IP: Exec: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Cannot plumb (%s) %s\n"), ifinst, strerror(errno)); - return (-1); - } + /* + * The hostname file exists; plumb the physical interface. + */ + if (!ifconfig(ifinst, fstr, "plumb", B_FALSE)) + goto fail; - /* Check if config file is empty, if so, nothing else to do */ - if (statb.st_size == 0) { + /* Skip static configuration if the hostname file is empty */ + if (statb.st_size <= 1) { rcm_log_message(RCM_TRACE1, - "IP: Zero size config file(%s)\n", ifinst); - return (0); + _("IP: Zero size hostname file(%s)\n"), ifinst); + goto configured; } - if ((fp = fopen(cfgfile, "r")) == NULL) { + if (fseek(hfp, 0, SEEK_SET) == -1) { rcm_log_message(RCM_ERROR, - _("IP: Open error(%s): %s\n"), cfgfile, strerror(errno)); - return (-1); + _("IP: Cannot rewind hostname file(%s): %s\n"), ifinst, + strerror(errno)); + goto fail; } + /* + * Allocate the worst-case single-line buffer sizes. A bit skanky, + * but since hostname files are small, this should suffice. 
+ */ if ((buf = calloc(1, statb.st_size)) == NULL) { rcm_log_message(RCM_ERROR, _("IP: calloc(%s): %s\n"), ifinst, strerror(errno)); - (void) fclose(fp); - return (-1); + goto fail; } - /* a single line with one token implies a classical if */ - if (fgets(buf, statb.st_size, fp) != NULL) { - tokenize(buf, tokens, tspace, &ntok); - if (ntok == 1) { - rcm_log_message(RCM_TRACE1, "IP: Standard interface\n"); - stdif++; - } - } - if (fseek(fp, 0L, SEEK_SET) == -1) { - rcm_log_message(RCM_ERROR, _("IP: fseek: %s\n"), - strerror(errno)); - return (-1); + ifparsebufsize = statb.st_size + sizeof (SBIN_IFPARSE " -s inet6 "); + if ((ifparsebuf = calloc(1, ifparsebufsize)) == NULL) { + rcm_log_message(RCM_ERROR, + _("IP: calloc(%s): %s\n"), ifinst, strerror(errno)); + goto fail; } /* - * Process the config command - * This loop also handles multiple logical interfaces that may - * be configured on a single line + * For IPv4, determine whether the hostname file consists of a single + * line. We need to handle these specially since they should + * automatically be suffixed with "netmask + broadcast + up". */ - while (fgets(buf, statb.st_size, fp) != NULL) { - nofailover = 0; - cmdvalid = 0; + if (af == AF_INET && + fgets(buf, statb.st_size, hfp) != NULL && + fgets(buf, statb.st_size, hfp) == NULL) { + rcm_log_message(RCM_TRACE1, "IP: one-line hostname file\n"); + stdif = B_TRUE; + } - if (*buf == '\0') - continue; + if (fseek(hfp, 0L, SEEK_SET) == -1) { + rcm_log_message(RCM_ERROR, + _("IP: Cannot rewind hostname file(%s): %s\n"), ifinst, + strerror(errno)); + goto fail; + } - tokenize(buf, tokens, tspace, &ntok); - if (ntok <= 0) + /* + * Loop through the file one line at a time and feed it to ifconfig. + * If the interface is using IPMP, then we use /sbin/ifparse -s to + * weed out all of the data addresses, since those are already on the + * IPMP meta-interface. 
+ */ + while (fgets(buf, statb.st_size, hfp) != NULL) { + if (ntok(buf) == 0) continue; - /* Reset the config command */ - (void) snprintf(syscmd, sizeof (syscmd), "%s %s %s ", - USR_SBIN_IFCONFIG, ifinst, fstr); - - /* No parsing if this is first interface of its kind */ - if (newattach) { - (void) strcat(syscmd, buf); - /* Classic if */ - if ((af & CONFIG_AF_INET) && (stdif == 1)) { - (void) strcat(syscmd, CFG_CMDS_STD); - } - rcm_log_message(RCM_TRACE1, "IP: New: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: Error: %s (%s): %s\n"), - syscmd, ifinst, strerror(errno)); - } + if (!ipmp) { + (void) ifconfig(ifinst, fstr, buf, stdif); continue; } - /* Parse the tokens to determine nature of the interface */ - for (n = 0; n < ntok; n++) { - /* Handle pathological failover cases */ - if (STREQ("-failover", tokens[n])) - nofailover++; - if (STREQ("failover", tokens[n])) - nofailover--; - - /* group attribute requires special processing */ - if (STREQ("group", tokens[n])) { - if (tokens[n + 1] != NULL) { - (void) snprintf(grpcmd, sizeof (grpcmd), - "%s %s %s %s %s", USR_SBIN_IFCONFIG, - ifinst, fstr, - tokens[n], tokens[n + 1]); - n++; /* skip next token */ - continue; - } - } - - /* Execute buffered command ? */ - if (STREQ("set", tokens[n]) || - STREQ("addif", tokens[n]) || - STREQ("removeif", tokens[n]) || - (n == (ntok -1))) { - - /* config command complete ? */ - if (n == (ntok -1)) { - ADDSPACE(syscmd); - (void) strcat(syscmd, tokens[n]); - cmdvalid++; - } - - if (!cmdvalid) { - ADDSPACE(syscmd); - (void) strcat(syscmd, tokens[n]); - cmdvalid++; - continue; - } - /* Classic if ? 
*/ - if ((af & CONFIG_AF_INET) && (stdif == 1)) { - (void) strcat(syscmd, CFG_CMDS_STD); - } - - if (nofailover > 0) { - rcm_log_message(RCM_TRACE1, - "IP: Interim exec: %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: %s fail(%s): %s\n"), - syscmd, ifinst, - strerror(errno)); - } - } else { - /* Have mpathd configure the address */ - if (if_mpathd_configure(syscmd, ifinst, - af, ipmp) != 0) { - rcm_log_message(RCM_ERROR, - _("IP: %s fail(%s): %s\n"), - syscmd, ifinst, - strerror(errno)); - } - } - - /* Reset config command */ - (void) snprintf(syscmd, sizeof (syscmd), - "%s %s %s ", USR_SBIN_IFCONFIG, ifinst, - fstr); - nofailover = 0; - cmdvalid = 0; - } - /* - * Note: No explicit command validation is required - * since ifconfig to does it for us - */ - ADDSPACE(syscmd); - (void) strcat(syscmd, tokens[n]); - cmdvalid++; - } - } - - free(buf); - (void) fclose(fp); - - /* - * The group name needs to be set after all the test/nofailover - * addresses have been configured. Otherwise, if IPMP detects that the - * interface is failed, the addresses will be moved to a working - * interface before the '-failover' flag can be set. 
- */ - if (grpcmd[0] != '\0') { - rcm_log_message(RCM_TRACE1, "IP: set group name: %s\n", grpcmd); - if (rcm_exec_cmd(grpcmd) != 0) { - rcm_log_message(RCM_ERROR, _("IP: %s fail(%s): %s\n"), - grpcmd, ifinst, strerror(errno)); + (void) snprintf(ifparsebuf, ifparsebufsize, SBIN_IFPARSE + " -s %s %s", fstr, buf); + if ((ifparsefp = popen(ifparsebuf, "r")) == NULL) { + rcm_log_message(RCM_ERROR, + _("IP: cannot configure %s: popen \"%s\" " + "failed: %s\n"), ifinst, buf, strerror(errno)); + goto fail; } - } - rcm_log_message(RCM_TRACE1, "IP: if_ipmp_config(%s) success\n", ifinst); - - return (0); -} - -/* - * if_mpathd_configure() - Determine configuration disposition of the interface - */ -static int -if_mpathd_configure(char *syscmd, char *ifinst, int af, int ipmp) -{ - char *tokens[MAXARGS]; - char tspace[MAXLINE]; - int ntok; - char *addr; - char *from_lifname; - mpathd_cmd_t mpdcmd; - int n; - - rcm_log_message(RCM_TRACE1, "IP: if_mpathd_configure(%s): %s\n", - ifinst, syscmd); - - tokenize(syscmd, tokens, tspace, &ntok); - if (ntok <= 0) - return (0); - - addr = tokens[3]; /* by default, third token is valid address */ - for (n = 0; n < ntok; n++) { - if (STREQ("set", tokens[n]) || - STREQ("addif", tokens[n])) { - addr = tokens[n+1]; - if (addr == NULL) { /* invalid format */ - return (-1); - } else - break; + while (fgets(buf, statb.st_size, ifparsefp) != NULL) { + if (ntok(buf) > 0) + (void) ifconfig(ifinst, fstr, buf, stdif); } - } - /* Check std. 
commands or no failed over address */ - if (STREQ("removeif", addr) || STREQ("group", addr) || - ((from_lifname = get_mpathd_dest(addr, af)) == NULL)) { - rcm_log_message(RCM_TRACE1, - "IP: No failed-over host, exec %s\n", syscmd); - if (rcm_exec_cmd(syscmd) != 0) { + if (pclose(ifparsefp) == -1) { rcm_log_message(RCM_ERROR, - _("IP: %s failed(%s): %s\n"), - syscmd, ifinst, strerror(errno)); - return (-1); + _("IP: cannot configure %s: pclose \"%s\" " + "failed: %s\n"), ifinst, buf, strerror(errno)); + goto fail; } - return (0); - } - - /* Check for non-IPMP failover scenarios */ - if ((ipmp <= 0) && (from_lifname != NULL)) { - /* Address already hosted on another NIC, return */ - rcm_log_message(RCM_TRACE1, - "IP: Non-IPMP failed-over host(%s): %s\n", - ifinst, addr); - return (0); } +configured: /* - * Valid failed-over host; have mpathd set the original index + * Bring up the interface (it may already be up) + * + * Technically, since the boot scripts only unconditionally bring up + * IPv6 interfaces, we should only unconditionally bring up IPv6 here. + * However, if we don't bring up IPv4, and a legacy IPMP configuration + * without test addresses is being used, we will never bring the + * interface up even though we would've at boot. One fix is to check + * if the IPv4 hostname file contains data addresses that we would've + * brought up, but there's no simple way to do that. Given that it's + * rare to have persistent IP configuration for an interface that + * leaves it down, we cheap out and always bring it up for IPMP. 
*/ - mpdcmd.cmd_command = MI_SETOINDEX; - (void) strcpy(mpdcmd.from_lifname, from_lifname); - (void) strcpy(mpdcmd.to_pifname, ifinst); - if (af & CONFIG_AF_INET6) { - mpdcmd.addr_family = AF_INET6; - } else { - mpdcmd.addr_family = AF_INET; - } - - /* Send command to in.mpathd(1M) */ - rcm_log_message(RCM_TRACE1, - "IP: Attempting setoindex from (%s) to (%s) ....\n", - from_lifname, ifinst); - - if (mpathd_send_cmd(&mpdcmd) < 0) { - rcm_log_message(RCM_TRACE1, - "IP: mpathd set original index unsuccessful: %s\n", - strerror(errno)); - return (-1); - } - - rcm_log_message(RCM_TRACE1, - "IP: setoindex success (%s) to (%s)\n", - from_lifname, ifinst); - - return (0); -} - -/* - * get_mpathd_dest() - Return current destination for lif; caller is - * responsible to free memory allocated for address - */ -static char * -get_mpathd_dest(char *addr, int family) -{ - int sock; - char *buf; - struct lifnum lifn; - struct lifconf lifc; - struct lifreq *lifrp; - sa_family_t af = AF_INET; /* IPv4 by default */ - int i; - struct lifreq lifreq; - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; - struct hostent *hp; - char *ifname = NULL; - char *prefix = NULL; - char addrstr[INET6_ADDRSTRLEN]; - char ifaddr[INET6_ADDRSTRLEN]; - int err; - - if (addr == NULL) { - return (NULL); - } - - rcm_log_message(RCM_TRACE2, "IP: get_mpathd_dest(%s)\n", addr); - - if (family & CONFIG_AF_INET6) { - af = AF_INET6; - } else { - af = AF_INET; - } - - if ((sock = socket(af, SOCK_DGRAM, 0)) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: failure opening %s socket: %s\n"), - af == AF_INET6 ? 
"IPv6" : "IPv4", strerror(errno)); - return (NULL); - } - - lifn.lifn_family = af; - lifn.lifn_flags = 0; - if (ioctl(sock, SIOCGLIFNUM, (char *)&lifn) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCLGIFNUM failed: %s\n"), - strerror(errno)); - (void) close(sock); - return (NULL); - } - - if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) { - rcm_log_message(RCM_ERROR, _("IP: calloc: %s\n"), - strerror(errno)); - (void) close(sock); - return (NULL); - } - - lifc.lifc_family = af; - lifc.lifc_flags = 0; - lifc.lifc_len = sizeof (struct lifreq) * lifn.lifn_count; - lifc.lifc_buf = buf; - - if (ioctl(sock, SIOCGLIFCONF, (char *)&lifc) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFCONF failed: %s\n"), - strerror(errno)); - free(buf); - (void) close(sock); - return (NULL); - } + if ((af == AF_INET6 || ipmp) && !ifconfig(ifinst, fstr, "up", B_FALSE)) + goto fail; - /* Filter out prefix address from netmask */ - (void) strcpy(ifaddr, addr); - if ((prefix = strchr(ifaddr, '/')) != NULL) { - *prefix = '\0'; /* We care about the address part only */ - } + /* + * For IPv4, if a DHCP configuration file exists, have DHCP configure + * the interface. As with the boot scripts, this is done after the + * hostname files are processed so that configuration in those files + * (such as IPMP group names) will be applied first. 
+ */ + if (af == AF_INET) { + char dhcpfile[MAXPATHLEN]; + char *dhcpbuf; + off_t i, dhcpsize; - /* Check for aliases */ - hp = getipnodebyname(ifaddr, af, AI_DEFAULT, &err); - if (hp) { - if (inet_ntop(af, (void *)hp->h_addr_list[0], - ifaddr, sizeof (ifaddr)) == NULL) { - /* Restore original address and use it */ - (void) strcpy(ifaddr, addr); - if ((prefix = strchr(ifaddr, '/')) != NULL) { - *prefix = '\0'; - } - } - freehostent(hp); - } - rcm_log_message(RCM_TRACE2, "IP: ifaddr(%s) = %s\n", addr, ifaddr); + (void) snprintf(dhcpfile, MAXPATHLEN, DHCPFILE_FMT, ifinst); + if (stat(dhcpfile, &statb) == -1) + goto out; - /* now search the interfaces */ - lifrp = lifc.lifc_req; - for (i = 0; i < lifn.lifn_count; i++, lifrp++) { - (void) strcpy(lifreq.lifr_name, lifrp->lifr_name); - /* Get the interface address for this interface */ - if (ioctl(sock, SIOCGLIFADDR, (char *)&lifreq) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCGLIFADDR: %s\n"), strerror(errno)); - free(buf); - (void) close(sock); - return (NULL); - } - - if (af == AF_INET6) { - sin6 = (struct sockaddr_in6 *)&lifreq.lifr_addr; - if (inet_ntop(AF_INET6, (void *)&sin6->sin6_addr, - addrstr, sizeof (addrstr)) == NULL) { - continue; - } - } else { - sin = (struct sockaddr_in *)&lifreq.lifr_addr; - if (inet_ntop(AF_INET, (void *)&sin->sin_addr, - addrstr, sizeof (addrstr)) == NULL) { - continue; - } + if ((dhcpbuf = copylist(dhcpfile, &dhcpsize)) == NULL) { + rcm_log_message(RCM_ERROR, _("IP: cannot read " + "(%s): %s\n"), dhcpfile, strerror(errno)); + goto fail; } - if (STREQ(addrstr, ifaddr)) { - /* Allocate memory to hold interface name */ - if ((ifname = (char *)malloc(LIFNAMSIZ)) == NULL) { - rcm_log_message(RCM_ERROR, - _("IP: malloc: %s\n"), strerror(errno)); - free(buf); - (void) close(sock); - return (NULL); - } - - /* Copy the interface name */ - /* - * (void) memcpy(ifname, lifrp->lifr_name, - * sizeof (ifname)); - * ifname[sizeof (ifname) - 1] = '\0'; - */ - (void) strcpy(ifname, 
lifrp->lifr_name); - break; + /* + * The copylist() API converts \n's to \0's, but we want them + * to be spaces. + */ + if (dhcpsize > 0) { + for (i = 0; i < dhcpsize; i++) + if (dhcpbuf[i] == '\0') + dhcpbuf[i] = ' '; + dhcpbuf[dhcpsize - 1] = '\0'; } + (void) ifconfig(ifinst, CFG_DHCP_CMD, dhcpbuf, B_FALSE); + free(dhcpbuf); } - - (void) close(sock); +out: + free(ifparsebuf); free(buf); - - if (ifname == NULL) - rcm_log_message(RCM_TRACE2, "IP: get_mpathd_dest(%s): none\n", - addr); - else - rcm_log_message(RCM_TRACE2, "IP: get_mpathd_dest(%s): %s\n", - addr, ifname); - - return (ifname); -} - -static int -if_getcount(int af) -{ - int sock; - struct lifnum lifn; - - rcm_log_message(RCM_TRACE1, "IP: if_getcount\n"); - - if ((sock = socket(af, SOCK_DGRAM, 0)) == -1) { - rcm_log_message(RCM_ERROR, - _("IP: failure opening %s socket: %s\n"), - af == AF_INET6 ? "IPv6" : "IPv4", strerror(errno)); - return (-1); - } - - lifn.lifn_family = af; - lifn.lifn_flags = 0; - if (ioctl(sock, SIOCGLIFNUM, (char *)&lifn) < 0) { - rcm_log_message(RCM_ERROR, - _("IP: SIOCLGIFNUM failed: %s\n"), - strerror(errno)); - (void) close(sock); - return (-1); - } - (void) close(sock); - - rcm_log_message(RCM_TRACE1, "IP: if_getcount success: %d\n", - lifn.lifn_count); - - return (lifn.lifn_count); + rcm_log_message(RCM_TRACE1, "IP: if_config_inst(%s) success\n", ifinst); + return (0); +fail: + free(ifparsebuf); + free(buf); + rcm_log_message(RCM_ERROR, "IP: if_config_inst(%s) failure\n", ifinst); + return (-1); } /* - * tokenize() - turn a command line into tokens; caller is responsible to - * provide enough memory to hold all tokens + * ntok() - count the number of tokens in the provided buffer. 
*/ -static void -tokenize(char *line, char **tokens, char *tspace, int *ntok) +static uint_t +ntok(const char *cp) { - char *cp; - char *sp; + uint_t ntok = 0; - sp = tspace; - cp = line; - for (*ntok = 0; *ntok < MAXARGS; (*ntok)++) { - tokens[*ntok] = sp; + for (;;) { while (ISSPACE(*cp)) cp++; + if (ISEOL(*cp)) break; + do { - *sp++ = *cp++; + cp++; } while (!ISSPACE(*cp) && !ISEOL(*cp)); - *sp++ = '\0'; + ntok++; + } + return (ntok); +} + +static boolean_t +ifconfig(const char *ifinst, const char *fstr, const char *buf, boolean_t stdif) +{ + char syscmd[MAX_RECONFIG_SIZE + MAXPATHLEN + 1]; + int status; + + (void) snprintf(syscmd, sizeof (syscmd), SBIN_IFCONFIG " %s %s %s", + ifinst, fstr, buf); + + if (stdif) + (void) strlcat(syscmd, CFG_CMDS_STD, sizeof (syscmd)); + + rcm_log_message(RCM_TRACE1, "IP: Exec: %s\n", syscmd); + if ((status = rcm_exec_cmd(syscmd)) != 0) { + if (WIFEXITED(status)) { + rcm_log_message(RCM_ERROR, _("IP: \"%s\" failed with " + "exit status %d\n"), syscmd, WEXITSTATUS(status)); + } else { + rcm_log_message(RCM_ERROR, _("IP: Error: %s: %s\n"), + syscmd, strerror(errno)); + } + return (B_FALSE); } + return (B_TRUE); } diff --git a/usr/src/cmd/svc/milestone/net-init b/usr/src/cmd/svc/milestone/net-init index 26b295dce9..7f0804af67 100644 --- a/usr/src/cmd/svc/milestone/net-init +++ b/usr/src/cmd/svc/milestone/net-init @@ -20,11 +20,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # This is the second phase of TCP/IP configuration. The first part is # run by the svc:/network/physical service and includes configuring the # interfaces and setting the machine's hostname. 
The svc:/network/initial @@ -52,10 +50,11 @@ if [ -f /etc/inet/ipaddrsel.conf ]; then fi # -# Now that /usr is mounted, see if in.mpathd needs to be started by firing it -# up in "adopt" mode; if there are no interfaces it needs to manage, it will -# automatically exit. Note that it may already be running if we're not -# executing as part of system boot. +# If explicit IPMP groups are being used, in.mpathd will already be started. +# However, if TRACK_INTERFACES_ONLY_WITH_GROUPS=no and no explicit IPMP +# groups have been configured, then it still needs to be started. So, fire +# it up in "adopt" mode; if there are no interfaces it needs to manage, it +# will automatically exit. # /usr/bin/pgrep -x -u 0 -z `smf_zonename` in.mpathd >/dev/null 2>&1 || \ /usr/lib/inet/in.mpathd -a diff --git a/usr/src/cmd/svc/milestone/net-loopback b/usr/src/cmd/svc/milestone/net-loopback index 3bd5a0f525..d07afd4ada 100644 --- a/usr/src/cmd/svc/milestone/net-loopback +++ b/usr/src/cmd/svc/milestone/net-loopback @@ -20,10 +20,9 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" . /lib/svc/share/smf_include.sh @@ -36,14 +35,6 @@ smf_configure_ip || exit $SMF_EXIT_OK # -# Cause ifconfig to not automatically start in.mpathd when IPMP groups are -# configured. This is not strictly necessary but makes it so that in.mpathd -# will always be started explicitly from /lib/svc/method/net-init (the -# svc:/network/initial service), when we're sure that /usr is mounted. -# -SUNW_NO_MPATHD=; export SUNW_NO_MPATHD - -# # Before any interfaces are configured, we need to set the system # default IP forwarding behavior. 
This will be the setting for # interfaces that don't modify the per-interface setting with the diff --git a/usr/src/cmd/svc/milestone/net-physical b/usr/src/cmd/svc/milestone/net-physical index 8530806768..bc74c2a206 100644 --- a/usr/src/cmd/svc/milestone/net-physical +++ b/usr/src/cmd/svc/milestone/net-physical @@ -20,7 +20,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T. @@ -38,22 +38,9 @@ # smf_configure_ip || exit $SMF_EXIT_OK -# Print warnings to console -warn_failed_ifs() { - echo "Failed to $1 interface(s): $2" >/dev/msglog -} - # Make sure that the libraries essential to this stage of booting can be found. LD_LIBRARY_PATH=/lib; export LD_LIBRARY_PATH -# -# Cause ifconfig to not automatically start in.mpathd when IPMP groups are -# configured. This is not strictly necessary but makes it so that in.mpathd -# will always be started explicitly from /etc/init.d/inetinit, when we're -# sure that /usr is mounted. 
-# -SUNW_NO_MPATHD=; export SUNW_NO_MPATHD - smf_netstrategy if smf_is_globalzone; then @@ -127,13 +114,18 @@ if [ "$interface_names" != "/etc/hostname.*[0-9]" ]; then IFS="$ORIGIFS" while [ $# -ge 2 ]; do shift - if [ $# -gt 1 -a "$2" != "/etc/hostname" ]; then - while [ $# -gt 1 -a "$1" != "/etc/hostname" ]; do - shift - done - else - inet_list="$inet_list $1" + intf_name=$1 + while [ $# -gt 1 -a "$2" != "/etc/hostname" ]; do + intf_name="$intf_name.$2" shift + done + shift + + read one rest < /etc/hostname.$intf_name + if [ "$one" = ipmp ]; then + ipmp_list="$ipmp_list $intf_name" + else + inet_list="$inet_list $intf_name" fi done fi @@ -151,17 +143,38 @@ if [ "$interface_names" != "/etc/hostname6.*[0-9]" ]; then IFS="$ORIGIFS" while [ $# -ge 2 ]; do shift - if [ $# -gt 1 -a "$2" != "/etc/hostname6" ]; then - while [ $# -gt 1 -a "$1" != "/etc/hostname6" ]; do - shift - done - else - inet6_list="$inet6_list $1" + intf_name=$1 + while [ $# -gt 1 -a "$2" != "/etc/hostname6" ]; do + intf_name="$intf_name.$2" shift + done + shift + + read one rest < /etc/hostname6.$intf_name + if [ "$one" = ipmp ]; then + ipmp6_list="$ipmp6_list $intf_name" + else + inet6_list="$inet6_list $intf_name" fi done fi +# +# Create all of the IPv4 IPMP interfaces. +# +if [ -n "$ipmp_list" ]; then + set -- $ipmp_list + while [ $# -gt 0 ]; do + if /sbin/ifconfig $1 ipmp; then + ipmp_created="$ipmp_created $1" + else + ipmp_failed="$ipmp_failed $1" + fi + shift + done + [ -n "$ipmp_failed" ] && warn_failed_ifs "create IPv4 IPMP" \ + "$ipmp_failed" +fi # # Step through the IPv4 interface list and try to plumb every interface. 
@@ -178,7 +191,7 @@ if [ -n "$inet_list" ]; then fi shift done - [ -n "$inet_failed" ] && warn_failed_ifs "plumb IPv4" $inet_failed + [ -n "$inet_failed" ] && warn_failed_ifs "plumb IPv4" "$inet_failed" fi # Run autoconf to connect to a WLAN if the interface is a wireless one @@ -209,7 +222,24 @@ if [ -n "$inet6_list" ]; then fi shift done - [ -n "$inet6_failed" ] && warn_failed_ifs "plumb IPv6" $inet6_failed + [ -n "$inet6_failed" ] && warn_failed_ifs "plumb IPv6" "$inet6_failed" +fi + +# +# Create all of the IPv6 IPMP interfaces. +# +if [ -n "$ipmp6_list" ]; then + set -- $ipmp6_list + while [ $# -gt 0 ]; do + if /sbin/ifconfig $1 inet6 ipmp; then + ipmp6_created="$ipmp6_created $1" + else + ipmp6_failed="$ipmp6_failed $1" + fi + shift + done + [ -n "$ipmp6_failed" ] && warn_failed_ifs "create IPv6 IPMP" \ + "$ipmp6_failed" fi if smf_is_globalzone; then @@ -224,49 +254,24 @@ if smf_is_globalzone; then fi # -# Process the /etc/hostname.* files of plumbed IPv4 interfaces. If an -# /etc/hostname file is not present or is empty, the ifconfig auto-dhcp -# / auto-revarp command will attempt to set the address, later. +# Process the /etc/hostname[6].* files for IPMP interfaces. Processing these +# before non-IPMP interfaces avoids accidental implicit IPMP group creation. +# +[ -n "$ipmp_created" ] && if_configure inet "IPMP" $ipmp_created +[ -n "$ipmp6_created" ] && if_configure inet6 "IPMP" $ipmp6_created + # -# If /etc/hostname.lo0 exists the loop below will do additional -# configuration of lo0. +# Process the /etc/hostname[6].* files for non-IPMP interfaces. # -if [ -n "$inet_plumbed" ]; then - i4s_fail= - echo "configuring IPv4 interfaces:\c" - set -- $inet_plumbed - while [ $# -gt 0 ]; do - inet_process_hostname /sbin/ifconfig $1 inet \ - </etc/hostname.$1 >/dev/null - [ $? != 0 ] && i4s_fail="$i4s_fail $1" - echo " $1\c" - shift - done - echo "." 
- [ -n "$i4s_fail" ] && warn_failed_ifs "configure IPv4" $i4s_fail -fi +[ -n "$inet_plumbed" ] && if_configure inet "" $inet_plumbed +[ -n "$inet6_plumbed" ] && if_configure inet6 "" $inet6_plumbed # -# Process the /etc/hostname6.* files of plumbed IPv6 interfaces. After -# processing the hostname6 file, bring the interface up. If -# /etc/hostname6.lo0 exists the loop below will do additional -# configuration of lo0. +# For the IPv4 and IPv6 interfaces that failed to plumb, find (or create) +# IPMP meta-interfaces to host their data addresses. # -if [ -n "$inet6_plumbed" ]; then - i6_fail= - echo "configuring IPv6 interfaces:\c" - set -- $inet6_plumbed - while [ $# -gt 0 ]; do - inet6_process_hostname /sbin/ifconfig $1 inet6 \ - </etc/hostname6.$1 >/dev/null && - /sbin/ifconfig $1 inet6 up - [ $? != 0 ] && i6_fail="$i6_fail $1" - echo " $1\c" - shift - done - echo "." - [ -n "$i6_fail" ] && warn_failed_ifs "configure IPv6" $i6_fail -fi +[ -n "$inet_failed" ] && move_addresses inet +[ -n "$inet6_failed" ] && move_addresses inet6 # Run DHCP if requested. Skip boot-configured interface. interface_names="`echo /etc/dhcp.*[0-9] 2>/dev/null`" @@ -326,7 +331,7 @@ if [ "$interface_names" != '/etc/dhcp.*[0-9]' ]; then done IFS="$ORIGIFS" unset ORIGIFS - [ -n "$i4d_fail" ] && warn_failed_ifs "configure IPv4 DHCP" $i4d_fail + [ -n "$i4d_fail" ] && warn_failed_ifs "configure IPv4 DHCP" "$i4d_fail" fi # In order to avoid bringing up the interfaces that have @@ -338,14 +343,6 @@ if [ "$_INIT_NET_STRATEGY" = "rarp" -o -z "$hostname" ]; then fi # -# Process IPv4 and IPv6 interfaces that failed to plumb. Find an -# alternative interface to host the addresses. -# -[ -n "$inet_failed" ] && move_addresses inet - -[ -n "$inet6_failed" ] && move_addresses inet6 - -# # If the /etc/defaultrouter file exists, process it now so that the next # stage of booting will have access to NFS. 
# diff --git a/usr/src/cmd/svc/shell/net_include.sh b/usr/src/cmd/svc/shell/net_include.sh index 51c87a40a8..71dc6a8256 100644 --- a/usr/src/cmd/svc/shell/net_include.sh +++ b/usr/src/cmd/svc/shell/net_include.sh @@ -20,13 +20,18 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T. # All rights reserved. # +# Print warnings to console +warn_failed_ifs() { + echo "Failed to $1 interface(s):$2" >/dev/msglog +} + # # shcat file # Simulates cat in sh so it doesn't need to be on the root filesystem. @@ -41,20 +46,28 @@ shcat() { } # -# Inet_list, list of IPv4 interfaces. -# Inet_plumbed, list of plumbed IPv4 interfaces. -# Inet_failed, list of IPv4 interfaces that failed to plumb. -# Inet6_list, list of IPv6 interfaces. -# Inet6_plumbed, list of plumbed IPv6 interfaces. -# Inet6_failed, list of IPv6 interfaces that failed to plumb. +# inet_list list of IPv4 interfaces. +# inet6_list list of IPv6 interfaces. +# ipmp_list list of IPMP IPv4 interfaces. +# ipmp6_list list of IPMP IPv6 interfaces. +# inet_plumbed list of plumbed IPv4 interfaces. +# inet6_plumbed list of plumbed IPv6 interfaces. +# ipmp_created list of created IPMP IPv4 interfaces. +# ipmp6_created list of created IPMP IPv6 interfaces. +# inet_failed list of IPv4 interfaces that failed to plumb. +# inet6_failed list of IPv6 interfaces that failed to plumb. +# ipmp_failed list of IPMP IPv4 interfaces that failed to be created. +# ipmp6_failed list of IPMP IPv6 interfaces that failed to be created. # unset inet_list inet_plumbed inet_failed \ - inet6_list inet6_plumbed inet6_failed + inet6_list inet6_plumbed inet6_failed \ + ipmp_list ipmp_created ipmp_failed \ + ipmp6_list ipmp6_created ipmp6_failed + # # get_physical interface # -# Return physical interface corresponding to the given logical -# interface. 
+# Return physical interface corresponding to the given interface. # get_physical() { @@ -70,7 +83,7 @@ get_physical() # get_logical interface # # Return logical interface number. Zero will be returned -# if there is no explicit logical device number. +# if there is no explicit logical number. # get_logical() { @@ -89,19 +102,18 @@ get_logical() # # if_comp if1 if2 # -# Compare Interfaces. Do the physical interface names and logical interface +# Compare interfaces. Do the physical interface names and logical interface # numbers match? # if_comp() { - [ "`get_physical $1`" = "`get_physical $2`" ] && \ - [ `get_logical $1` -eq `get_logical $2` ] + physical_comp $1 $2 && [ `get_logical $1` -eq `get_logical $2` ] } - + # # physical_comp if1 if2 # -# Do the two devices share a physical interface? +# Do the two interfaces share a physical interface? # physical_comp() { @@ -129,19 +141,110 @@ in_list() } # -# get_group_from_hostname interface type +# get_inactive_ifname groupname +# +# Return the name of an inactive interface in `groupname', if one exists. +# +get_inactive_ifname() +{ + ORIGIFS="$IFS" + /sbin/ipmpstat -gP -o groupname,interfaces | + while IFS=: read groupname ifnames; do + # + # Skip other IPMP groups. + # + [ "$groupname" != "$1" ] && continue + + # + # Standby interfaces are always enclosed in ()'s, so look + # for the first interface name starting with a "(", and + # strip those off. + # + IFS=" " + for ifname in $ifnames; do + case "$ifname" in + '('*) IFS="()" + echo $ifname + IFS="$ORIGIFS" + return + ;; + *) ;; + esac + done + done + IFS="$ORIGIFS" +} + +# +# get_groupifname groupname +# +# Return the IPMP meta-interface name for the group, if it exists. 
+# +get_groupifname() +{ + /sbin/ipmpstat -gP -o groupname,group | while IFS=: read name ifname; do + if [ "$name" = "$1" ]; then + echo "$ifname" + return + fi + done +} + +# +# create_ipmp ifname groupname type +# +# Helper function for create_groupifname() that returns zero if it's able +# to create an IPMP interface of the specified type and place it in the +# specified group, or non-zero otherwise. +# +create_ipmp() +{ + /sbin/ifconfig $1 >/dev/null 2>&1 && return 1 + /sbin/ifconfig $1 inet6 >/dev/null 2>&1 && return 1 + /sbin/ifconfig $1 $3 ipmp group $2 2>/dev/null +} + +# +# create_groupifname groupname type +# +# Create an IPMP meta-interface name for the group. We only use this +# function if all of the interfaces in the group failed at boot and there +# were no /etc/hostname[6].<if> files for the IPMP meta-interface. +# +create_groupifname() +{ + # + # This is a horrible way to count from 0 to 999, but in sh and + # without necessarily having /usr mounted, what else can we do? + # + for a in "" 1 2 3 4 5 6 7 8 9; do + for b in 0 1 2 3 4 5 6 7 8 9; do + for c in 0 1 2 3 4 5 6 7 8 9; do + # strip leading zeroes + [ "$a" = "" ] && [ "$b" = 0 ] && b="" + if create_ipmp ipmp$a$b$c $1 $2; then + echo ipmp$a$b$c + return + fi + done + done + done +} + +# +# get_hostname_ipmpinfo interface type # -# Return all group settings from hostname file for a given interface. +# Return all requested IPMP keywords from hostname file for a given interface. # # Example: -# get_group_from_hostname hme0 inet +# get_hostname_ipmpinfo hme0 inet keyword [ keyword ... 
] # -get_group_from_hostname() +get_hostname_ipmpinfo() { case "$2" in - inet) file=/etc/hostname.$1 + inet) file=/etc/hostname.$1 ;; - inet6) file=/etc/hostname6.$1 + inet6) file=/etc/hostname6.$1 ;; *) return @@ -150,16 +253,21 @@ get_group_from_hostname() [ -r "$file" ] || return + type=$2 + shift 2 + # - # Read through the hostname file looking for group settings - # There may be several group settings in the file. It is up - # to the caller to pick the right one (i.e. the last one). + # Read through the hostname file looking for the specified + # keywords. Since there may be several keywords that cancel + # each other out, the caller must post-process as appropriate. # while read line; do [ -z "$line" ] && continue - /sbin/ifparse -s "$2" $line - done < "$file" | while read one two three; do - [ "$one" = "group" ] && echo "$two" + /sbin/ifparse -s "$type" $line + done < "$file" | while read one two; do + for keyword in "$@"; do + [ "$one" = "$keyword" ] && echo "$one $two" + done done } @@ -174,7 +282,6 @@ get_group_from_hostname() get_group_for_type() { physical=`get_physical $1` - type=$2 group="" @@ -183,184 +290,77 @@ get_group_for_type() # the reason for the second while loop. # shift 2 - while [ $# -gt 0 ]; do - if if_comp "$physical" $1; then - get_group_from_hostname $1 $type + for ifname in "$@"; do + if if_comp "$physical" $ifname; then + get_hostname_ipmpinfo $ifname $type group fi - shift done | while :; do - read next || { + read keyword grname || { echo "$group" break } - group="$next" + group="$grname" done } # -# get_group interface [ configured | failed ] -# -# If there is both an inet and inet6 version of an interface, the group -# could be set in either set of hostname files. -# -# Inet6 is configured after inet, so if the group is set in both -# sets of hostname files, the inet6 file wins. -# -# The "configured" argument should be used to get the group for -# an interface that has been plumbed into the stack and configured. 
Use -# the "failed" argument to get the group for an interface that failed to -# plumb. -# -get_group() -{ - group="" - - case "$2" in - configured) - group=`get_group_for_type $1 inet6 $inet6_plumbed` - ;; - failed) - group=`get_group_for_type $1 inet6 $inet6_list` - ;; - *) - return - ;; - esac - - if [ -z "$group" ]; then - if [ "$2" = configured ]; then - group=`get_group_for_type $1 inet $inet_plumbed` - else - group=`get_group_for_type $1 inet $inet_list` - fi - fi - - echo $group -} - -# -# get_standby_from_hostname interface type -# -# Return any "standby" or "-standby" flags in the hostname file. -# -# Example: -# get_standby_from_hostname hme0 inet6 -# -# -get_standby_from_hostname() -{ - case "$2" in - inet) file=/etc/hostname.$1 - ;; - inet6) file=/etc/hostname6.$1 - ;; - *) - return - ;; - esac - - [ -r "$file" ] || return - - # - # There may be several instances of the "standby" and - # "-standby" flags in the hostname file. It is up to - # the caller to pick the correct one. - # - while read line; do - [ -z "$line" ] && continue - /sbin/ifparse -s "$2" $line - done < "$file" | while read one two; do - [ "$one" = "standby" ] || [ "$one" = "-standby" ] \ - && echo "$one" - done -} - -# -# get_standby_for_type interface type plumbed_list +# get_standby_for_type interface type list # # Look through the set of hostname files associated with the same physical -# interface as "interface", and determine whether they would configure -# the interface as a standby interface. +# interface as "interface", and print the standby value ("standby", +# "-standby", or nothing). Only hostname files associated with the +# physical interface or logical interface zero can set this flag. # get_standby_for_type() { - physical=`get_physical $1` type=$2 - final="" - # - # The last "standby" or "-standby" flag is the one that counts, - # which is the reason for the second while loop. 
+ # The last setting of "standby" or "-standby" is the one that + # counts, which is the reason for the second while loop. # shift 2 - while [ $# -gt 0 ]; do - if [ "`get_physical $1`" = "$physical" ]; then - get_standby_from_hostname $1 $type + for ifname in "$@"; do + if if_comp "$physical" $ifname; then + get_hostname_ipmpinfo $ifname $type standby -standby fi - shift done | while :; do - read next || { - echo "$final" + read keyword || { + echo "$iftype" break } - final="$next" + iftype="$keyword" done } # -# is_standby interface +# get_group interface # -# Determine whether a configured interface is a standby interface. -# -# Both the inet and inet6 hostname file sets must be checked. -# If "standby" or "-standby" is set in the inet6 hostname file set, -# don't bother looking at the inet set. +# If there is both an inet and inet6 version of an interface, the group +# could be set in either set of hostname files. Since inet6 is configured +# after inet, if there's a setting in both files, inet6 wins. # -is_standby() +get_group() { - standby=`get_standby_for_type $1 inet6 $inet6_plumbed` - - if [ -z "$standby" ]; then - standby=`get_standby_for_type $1 inet $inet_plumbed` - fi - - # The return value is the value of the following test. - [ "$standby" = "standby" ] + group=`get_group_for_type $1 inet6 $inet6_list` + [ -z "$group" ] && group=`get_group_for_type $1 inet $inet_list` + echo $group } # -# get_alternate interface plumbed_list -# -# Look for a plumbed interface in the same group as "interface". -# A standby interface is preferred over a non-standby interface. +# is_standby interface # -# Example: -# get_alternate hme0 $inet_plumbed +# If there is both an inet and inet6 version of an interface, the +# "standby" or "-standby" flag could be set in either set of hostname +# files. Since inet6 is configured after inet, if there's a setting in +# both files, inet6 wins. 
# -get_alternate() +is_standby() { - mygroup=`get_group $1 failed` - [ -z "$mygroup" ] && return - - maybe="" - - shift - while [ $# -gt 0 ]; do - group=`get_group $1 configured` - if [ "$group" = "$mygroup" ]; then - if is_standby $1; then - get_physical $1 - return - else - [ -z "$maybe" ] && maybe=$1 - fi - fi - shift - done - - get_physical $maybe + standby=`get_standby_for_type $1 inet6 $inet6_list` + [ -z "$standby" ] && standby=`get_standby_for_type $1 inet $inet_list` + [ "$standby" = "standby" ] } # @@ -394,7 +394,7 @@ doDHCPhostname() # # If there is only line in an hostname file we assume it contains # the old style address which results in the interface being brought up -# and the netmask and broadcast address being set. +# and the netmask and broadcast address being set ($inet_oneline_epilogue). # # If there are multiple lines we assume the file contains a list of # commands to the processor with neither the implied bringing up of the @@ -403,6 +403,8 @@ doDHCPhostname() # Return non-zero if any command fails so that the caller may alert # users to errors in the configuration. # +inet_oneline_epilogue="netmask + broadcast + up" + inet_process_hostname() { if doDHCPhostname $2; then @@ -418,7 +420,7 @@ inet_process_hostname() ifcmds="" retval=0 - while read line; do + while read one rest; do if [ -n "$ifcmds" ]; then # # This handles the first N-1 @@ -427,7 +429,14 @@ inet_process_hostname() $* $ifcmds || retval=$? multiple_lines=true fi - ifcmds="$line" + + # + # Strip out the "ipmp" keyword if it's the + # first token, since it's used to control + # interface creation, not configuration. + # + [ "$one" = ipmp ] && one= + ifcmds="$one $rest" done # @@ -437,8 +446,8 @@ inet_process_hostname() # [ -z "$ifcmds" ] && return $retval if [ $multiple_lines = false ]; then - # The traditional single-line hostname file. - ifcmds="$ifcmds netmask + broadcast + up" + # The traditional one-line hostname file. 
+ ifcmds="$ifcmds $inet_oneline_epilogue" fi # @@ -470,7 +479,13 @@ inet_process_hostname() inet6_process_hostname() { retval=0 - while read ifcmds; do + while read one rest; do + # + # See comment in inet_process_hostname for details. + # + [ "$one" = ipmp ] && one= + ifcmds="$one $rest" + if [ -n "$ifcmds" ]; then $* $ifcmds || retval=$? fi @@ -479,10 +494,9 @@ inet6_process_hostname() } # -# Process interfaces that failed to plumb. Find an alternative -# interface to host the addresses. For IPv6, only static addresses -# defined in hostname6 files are moved, autoconfigured addresses are -# not moved. +# Process interfaces that failed to plumb. Find the IPMP meta-interface +# that should host the addresses. For IPv6, only static addresses defined +# in hostname6 files are moved, autoconfigured addresses are not moved. # # Example: # move_addresses inet6 @@ -491,35 +505,43 @@ move_addresses() { type="$1" eval "failed=\"\$${type}_failed\"" - eval "plumbed=\"\$${type}_plumbed\"" eval "list=\"\$${type}_list\"" - process_hostname="${type}_process_hostname" + process_func="${type}_process_hostname" processed="" if [ "$type" = inet ]; then - echo "moving addresses from failed IPv4 interfaces:\c" + typedesc="IPv4" zaddr="0.0.0.0" hostpfx="/etc/hostname" else - echo "moving addresses from failed IPv6 interfaces:\c" + typedesc="IPv6" zaddr="::" hostpfx="/etc/hostname6" fi - set -- $failed - while [ $# -gt 0 ]; do - in_list if_comp $1 $processed && { shift; continue; } - - alternate="`get_alternate $1 $plumbed`" - if [ -z "$alternate" ]; then - in_list physical_comp $1 $processed || { - echo " $1 (couldn't move, no" \ - "alternative interface)\c" - processed="$processed $1" + echo "Moving addresses from missing ${typedesc} interface(s):\c" \ + >/dev/msglog + + for ifname in $failed; do + in_list if_comp $ifname $processed && continue + + group=`get_group $ifname` + if [ -z "$group" ]; then + in_list physical_comp $ifname $processed || { + echo " $ifname (not moved -- not" \ 
+ "in an IPMP group)\c" >/dev/msglog + processed="$processed $ifname" } - shift continue fi + + # + # Lookup the IPMP meta-interface name. If one doesn't exist, + # create it. + # + grifname=`get_groupifname $group` + [ -z "$grifname" ] && grifname=`create_groupifname $group $type` + # # The hostname files are processed twice. In the first # pass, we are looking for all commands that apply @@ -528,7 +550,7 @@ move_addresses() # whether the address represents a failover address # or not until we've read all the files associated with the # interface. - + # # In the first pass through the hostname files, all # additional logical interface commands are removed. # The remaining commands are concatenated together and @@ -541,19 +563,18 @@ move_addresses() # the embedded "set" command set the address later. # /sbin/ifparse -f $type ` - for item in $list; do - if_comp $1 $item && \ - $process_hostname /sbin/ifparse \ - $type < $hostpfx.$item - done | while read three four; do - [ "$three" != addif ] && \ - echo "$three $four \c" - done` | while read one two; do - [ -z "$one" ] && continue - line="addif $zaddr $one $two" - /sbin/ifconfig $alternate $type \ - -standby $line >/dev/null - done + for item in $list; do + if_comp $ifname $item && $process_func \ + /sbin/ifparse $type < $hostpfx.$item + done | while read three four; do + [ "$three" != addif ] && echo "$three $four \c" + done` | while read one two; do + [ -z "$one" ] && continue + [ "$one $two" = "$inet_oneline_epilogue" ] && \ + continue + line="addif $zaddr $one $two" + /sbin/ifconfig $grifname $type $line >/dev/null + done # # In the second pass, look for the the "addif" commands @@ -561,22 +582,75 @@ move_addresses() # commands are not valid in logical interface hostname # files. 
# - if [ "$1" = "`get_physical $1`" ]; then - $process_hostname /sbin/ifparse -f $type \ - <$hostpfx.$1 | while read one two; do - [ "$one" = addif ] && \ - /sbin/ifconfig $alternate $type -standby \ - addif $two >/dev/null + if [ "$ifname" = "`get_physical $ifname`" ]; then + $process_func /sbin/ifparse -f $type < $hostpfx.$ifname \ + | while read one two; do + [ "$one" = addif ] && \ + /sbin/ifconfig $grifname $type \ + addif $two >/dev/null done fi - in_list physical_comp $1 $processed || { - echo " $1 (moved to $alternate)\c" - processed="$processed $1" + # + # Check if this was an active interface in the group. If so, + # activate another IP interface (if possible) + # + is_standby $ifname || inactive=`get_inactive_ifname $group` + [ -n "$inactive" ] && /sbin/ifconfig $inactive $type -standby + + in_list physical_comp $ifname $processed || { + processed="$processed $ifname" + echo " $ifname (moved to $grifname\c" > /dev/msglog + if [ -n "$inactive" ]; then + echo " and cleared 'standby' on\c" > /dev/msglog + echo " $inactive to compensate\c" > /dev/msglog + fi + echo ")\c" > /dev/msglog } + inactive="" + done + echo "." >/dev/msglog +} + +# +# if_configure type class interface_list +# +# Configure all of the interfaces of type `type' (e.g., "inet6") in +# `interface_list' according to their /etc/hostname[6].* files. `class' +# describes the class of interface (e.g., "IPMP"), as a diagnostic aid. +# For inet6 interfaces, the interface is also brought up. +# +if_configure() +{ + fail= + type=$1 + class=$2 + process_func=${type}_process_hostname + shift 2 + + if [ "$type" = inet ]; then + desc="IPv4" + hostpfx="/etc/hostname" + else + desc="IPv6" + hostpfx="/etc/hostname6" + fi + [ -n "$class" ] && desc="$class $desc" + + echo "configuring $desc interfaces:\c" + while [ $# -gt 0 ]; do + $process_func /sbin/ifconfig $1 $type < $hostpfx.$1 >/dev/null + if [ $? 
!= 0 ]; then + fail="$fail $1" + elif [ "$type" = inet6 ]; then + /sbin/ifconfig $1 inet6 up || fail="$fail $1" + fi + echo " $1\c" shift done echo "." + + [ -n "$fail" ] && warn_failed_ifs "configure $desc" "$fail" } # diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c index 46b2b5a958..dc90957dfa 100644 --- a/usr/src/cmd/truss/codes.c +++ b/usr/src/cmd/truss/codes.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -615,13 +615,10 @@ const struct ioc { { (uint_t)SIOCSIPSECONFIG, "SIOCSIPSECONFIG", NULL }, { (uint_t)SIOCDIPSECONFIG, "SIOCDIPSECONFIG", NULL }, { (uint_t)SIOCLIPSECONFIG, "SIOCLIPSECONFIG", NULL }, - { (uint_t)SIOCLIFFAILOVER, "SIOCLIFFAILOVER", "lifreq" }, - { (uint_t)SIOCLIFFAILBACK, "SIOCLIFFAILBACK", "lifreq" }, - { (uint_t)SIOCSIPMPFAILBACK, "SIOCSIPMPFAILBACK", NULL }, + { (uint_t)SIOCGLIFBINDING, "SIOCGLIFBINDING", "lifreq" }, { (uint_t)SIOCSLIFGROUPNAME, "SIOCSLIFGROUPNAME", "lifreq" }, { (uint_t)SIOCGLIFGROUPNAME, "SIOCGLIFGROUPNAME", "lifreq" }, - { (uint_t)SIOCGLIFOINDEX, "SIOCGLIFOINDEX", "lifreq" }, - { (uint_t)SIOCSLIFOINDEX, "SIOCSLIFOINDEX", "lifreq" }, + { (uint_t)SIOCGLIFGROUPINFO, "SIOCGLIFGROUPINFO", "lifgroupinfo" }, { (uint_t)SIOCGDSTINFO, "SIOCGDSTINFO", NULL }, { (uint_t)SIOCGIP6ADDRPOLICY, "SIOCGIP6ADDRPOLICY", NULL }, { (uint_t)SIOCSIP6ADDRPOLICY, "SIOCSIP6ADDRPOLICY", NULL }, diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c index edc610559d..8165f64f99 100644 --- a/usr/src/cmd/truss/print.c +++ b/usr/src/cmd/truss/print.c @@ -19,16 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ - -#pragma ident "%Z%%M% %I% %E% SMI" - #define _SYSCALL32 /* make 32-bit compat headers visible */ #include <stdio.h> @@ -73,6 +70,7 @@ #include <netinet/tcp.h> #include <netinet/udp.h> #include <netinet/sctp.h> +#include <net/route.h> #include <sys/utrap.h> #include <sys/lgrp_user.h> #include <sys/door.h> @@ -1749,6 +1747,8 @@ prt_sol(private_t *pri, int raw, long val) { if (val == SOL_SOCKET) { outstring(pri, "SOL_SOCKET"); + } else if (val == SOL_ROUTE) { + outstring(pri, "SOL_ROUTE"); } else { const struct protoent *p; struct protoent res; @@ -1826,6 +1826,18 @@ sol_optname(private_t *pri, long val) #undef CBSIZE } +const char * +route_optname(private_t *pri, long val) +{ + switch (val) { + case RT_AWARE: + return ("RT_AWARE"); + default: + (void) snprintf(pri->code_buf, sizeof (pri->code_buf), + "0x%lx", val); + return (pri->code_buf); + } +} const char * tcp_optname(private_t *pri, long val) @@ -1918,6 +1930,8 @@ prt_son(private_t *pri, int raw, long val) switch (pri->sys_args[1]) { case SOL_SOCKET: outstring(pri, sol_optname(pri, val)); break; + case SOL_ROUTE: outstring(pri, route_optname(pri, val)); + break; case IPPROTO_TCP: outstring(pri, tcp_optname(pri, val)); break; case IPPROTO_UDP: outstring(pri, udp_optname(pri, val)); diff --git a/usr/src/cmd/zoneadmd/vplat.c b/usr/src/cmd/zoneadmd/vplat.c index 72b6ce5c76..fb8f540cb5 100644 --- a/usr/src/cmd/zoneadmd/vplat.c +++ b/usr/src/cmd/zoneadmd/vplat.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -2397,6 +2397,7 @@ configure_one_interface(zlog_t *zlogp, zoneid_t zone_id, */ char buffer[INET6_ADDRSTRLEN]; void *addr; + const char *nomatch = "no matching subnet found in netmasks(4)"; if (af == AF_INET) addr = &((struct sockaddr_in *) @@ -2405,14 +2406,23 @@ configure_one_interface(zlog_t *zlogp, zoneid_t zone_id, addr = &((struct sockaddr_in6 *) (&lifr.lifr_addr))->sin6_addr; - /* Find out what netmask interface is going to be using */ + /* + * Find out what netmask the interface is going to be using. + * If we just brought up an IPMP data address on an underlying + * interface above, the address will have already migrated, so + * the SIOCGLIFNETMASK won't be able to find it (but we need + * to bring the address up to get the actual netmask). Just + * omit printing the actual netmask in this corner-case. + */ if (ioctl(s, SIOCGLIFNETMASK, (caddr_t)&lifr) < 0 || - inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL) - goto bad; - zerror(zlogp, B_FALSE, - "WARNING: %s: no matching subnet found in netmasks(4) for " - "%s; using default of %s.", - lifr.lifr_name, addrstr4, buffer); + inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL) { + zerror(zlogp, B_FALSE, "WARNING: %s; using default.", + nomatch); + } else { + zerror(zlogp, B_FALSE, + "WARNING: %s: %s: %s; using default of %s.", + lifr.lifr_name, nomatch, addrstr4, buffer); + } } /* |
