summaryrefslogtreecommitdiff
path: root/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c')
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c1013
1 files changed, 433 insertions, 580 deletions
diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c
index aa6a99fb9c..e1e22e12d4 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c
+++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "mpd_defs.h"
#include "mpd_tables.h"
@@ -46,7 +44,6 @@ static int lsock_v6; /* Listen socket to detect mpathd */
static int mibfd = -1; /* fd to get mib info */
static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */
-boolean_t full_scan_required = _B_FALSE;
static uint_t last_initifs_time; /* Time when initifs was last run */
static char **argv0; /* Saved for re-exec on SIGHUP */
boolean_t handle_link_notifications = _B_TRUE;
@@ -58,10 +55,6 @@ static void check_if_removed(struct phyint_instance *pii);
static void select_test_ifs(void);
static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
-static void router_add_v4(mib2_ipRouteEntry_t *rp1,
- struct in_addr nexthop_v4);
-static void router_add_v6(mib2_ipv6RouteEntry_t *rp1,
- struct in6_addr nexthop_v6);
static void router_add_common(int af, char *ifname,
struct in6_addr nexthop);
static void init_router_targets();
@@ -74,17 +67,17 @@ static void check_addr_unique(struct phyint_instance *,
static void init_host_targets(void);
static void dup_host_targets(struct phyint_instance *desired_pii);
static void loopback_cmd(int sock, int family);
-static int poll_remove(int fd);
static boolean_t daemonize(void);
static int closefunc(void *, int);
static unsigned int process_cmd(int newfd, union mi_commands *mpi);
static unsigned int process_query(int fd, mi_query_t *miq);
+static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop);
static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
static unsigned int send_result(int fd, unsigned int error, int syserror);
-struct local_addr *laddr_list = NULL;
+addrlist_t *localaddrs;
/*
* Return the current time in milliseconds (from an arbitrary reference)
@@ -153,7 +146,7 @@ retry:
/*
* Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
*/
-static int
+int
poll_remove(int fd)
{
int i;
@@ -205,17 +198,11 @@ pii_process(int af, char *name, struct phyint_instance **pii_p)
break;
case PI_GROUP_CHANGED:
- /*
- * The phyint has changed group.
- */
- restore_phyint(pii->pii_phyint);
- /* FALLTHRU */
-
case PI_IFINDEX_CHANGED:
/*
- * Interface index has changed. Delete and
- * recreate the phyint as it is quite likely
- * the interface has been unplumbed and replumbed.
+ * Interface index or group membership has changed.
+ * Delete the old state and recreate based on the new
+ * state (it may no longer be in a group).
*/
pii_other = phyint_inst_other(pii);
if (pii_other != NULL)
@@ -249,51 +236,26 @@ pii_process(int af, char *name, struct phyint_instance **pii_p)
}
/*
- * This phyint is leaving the group. Try to restore the phyint to its
- * initial state. Return the addresses that belong to other group members,
- * to the group, and take back any addresses owned by this phyint
- */
-void
-restore_phyint(struct phyint *pi)
-{
- if (pi->pi_group == phyint_anongroup)
- return;
-
- /*
- * Move everthing to some other member in the group.
- * The phyint has changed group in the kernel. But we
- * have yet to do it in our tables.
- */
- if (!pi->pi_empty)
- (void) try_failover(pi, FAILOVER_TO_ANY);
- /*
- * Move all addresses owned by 'pi' back to pi, from each
- * of the other members of the group
- */
- (void) try_failback(pi);
-}
-
-/*
* Scan all interfaces to detect changes as well as new and deleted interfaces
*/
static void
initifs()
{
- int n;
+ int i, nlifr;
int af;
char *cp;
char *buf;
- int numifs;
+ int sockfd;
+ uint64_t flags;
struct lifnum lifn;
struct lifconf lifc;
+ struct lifreq lifreq;
struct lifreq *lifr;
struct logint *li;
struct phyint_instance *pii;
struct phyint_instance *next_pii;
- char pi_name[LIFNAMSIZ + 1];
- boolean_t exists;
- struct phyint *pi;
- struct local_addr *next;
+ struct phyint_group *pg, *next_pg;
+ char pi_name[LIFNAMSIZ + 1];
if (debug & D_PHYINT)
logdebug("initifs: Scanning interfaces\n");
@@ -301,13 +263,9 @@ initifs()
last_initifs_time = getcurrenttime();
/*
- * Free the laddr_list before collecting the local addresses.
+ * Free the existing local address list; we'll build a new list below.
*/
- while (laddr_list != NULL) {
- next = laddr_list->next;
- free(laddr_list);
- laddr_list = next;
- }
+ addrlist_free(&localaddrs);
/*
* Mark the interfaces so that we can find phyints and logints
@@ -326,122 +284,142 @@ initifs()
}
}
+ /*
+ * As above, mark groups so that we can detect IPMP interfaces which
+ * have been removed from the kernel. Also, delete the group address
+ * list since we'll iteratively recreate it below.
+ */
+ for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
+ pg->pg_in_use = _B_FALSE;
+ addrlist_free(&pg->pg_addrs);
+ }
+
lifn.lifn_family = AF_UNSPEC;
- lifn.lifn_flags = LIFC_ALLZONES;
+ lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
+again:
if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
- logperror("initifs: ioctl (get interface numbers)");
+ logperror("initifs: ioctl (get interface count)");
return;
}
- numifs = lifn.lifn_count;
+ /*
+ * Pad the interface count to detect when additional interfaces have
+ * been configured between SIOCGLIFNUM and SIOCGLIFCONF.
+ */
+ lifn.lifn_count += 4;
- buf = (char *)calloc(numifs, sizeof (struct lifreq));
- if (buf == NULL) {
+ if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) {
logperror("initifs: calloc");
return;
}
lifc.lifc_family = AF_UNSPEC;
- lifc.lifc_flags = LIFC_ALLZONES;
- lifc.lifc_len = numifs * sizeof (struct lifreq);
+ lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
+ lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
lifc.lifc_buf = buf;
if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
- /*
- * EINVAL is commonly encountered, when things change
- * underneath us rapidly, (eg. at boot, when new interfaces
- * are plumbed successively) and the kernel finds the buffer
- * size we passed as too small. We will retry again
- * when we see the next routing socket msg, or at worst after
- * IF_SCAN_INTERVAL ms.
- */
- if (errno != EINVAL) {
- logperror("initifs: ioctl"
- " (get interface configuration)");
- }
+ logperror("initifs: ioctl (get interface configuration)");
free(buf);
return;
}
- lifr = (struct lifreq *)lifc.lifc_req;
-
/*
- * For each lifreq returned by SIOGGLIFCONF, call pii_process()
- * and get the state of the corresponding phyint_instance. If it is
- * successful, then call logint_init_from_k() to get the state of the
- * logint.
+ * If every lifr_req slot is taken, then additional interfaces must
+ * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF.
+ * Recalculate to make sure we didn't miss any interfaces.
*/
- for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) {
- int sockfd;
- struct local_addr *taddr;
- struct sockaddr_in *sin;
- struct sockaddr_in6 *sin6;
- struct lifreq lifreq;
+ nlifr = lifc.lifc_len / sizeof (struct lifreq);
+ if (nlifr >= lifn.lifn_count) {
+ free(buf);
+ goto again;
+ }
+ /*
+ * Walk through the lifreqs returned by SIOGGLIFCONF, and refresh the
+ * global list of addresses, phyint groups, phyints, and logints.
+ */
+ for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) {
af = lifr->lifr_addr.ss_family;
-
- /*
- * Collect all local addresses.
- */
sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
- (void) memset(&lifreq, 0, sizeof (lifreq));
- (void) strlcpy(lifreq.lifr_name, lifr->lifr_name,
- sizeof (lifreq.lifr_name));
+ (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ);
if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) {
if (errno != ENXIO)
logperror("initifs: ioctl (SIOCGLIFFLAGS)");
continue;
}
+ flags = lifreq.lifr_flags;
+
+ /*
+ * If the address is IFF_UP, add it to the local address list.
+ * (We ignore addresses that aren't IFF_UP since another node
+ * might legitimately have that address IFF_UP.)
+ */
+ if (flags & IFF_UP) {
+ (void) addrlist_add(&localaddrs, lifr->lifr_name, flags,
+ &lifr->lifr_addr);
+ }
/*
- * Add the interface address to laddr_list.
- * Another node might have the same IP address which is up.
- * In that case, it is appropriate to use the address as a
- * target, even though it is also configured (but not up) on
- * the local system.
- * Hence,the interface address is not added to laddr_list
- * unless it is IFF_UP.
+ * If this address is on an IPMP meta-interface, update our
+ * phyint_group information (either by recording that group
+ * still exists or creating a new group), and track what
+ * group the address is part of.
*/
- if (lifreq.lifr_flags & IFF_UP) {
- taddr = malloc(sizeof (struct local_addr));
- if (taddr == NULL) {
- logperror("initifs: malloc");
+ if (flags & IFF_IPMP) {
+ if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) {
+ if (errno != ENXIO)
+ logperror("initifs: ioctl "
+ "(SIOCGLIFGROUPNAME)");
continue;
}
- if (af == AF_INET) {
- sin = (struct sockaddr_in *)&lifr->lifr_addr;
- IN6_INADDR_TO_V4MAPPED(&sin->sin_addr,
- &taddr->addr);
- } else {
- sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr;
- taddr->addr = sin6->sin6_addr;
+
+ pg = phyint_group_lookup(lifreq.lifr_groupname);
+ if (pg == NULL) {
+ pg = phyint_group_create(lifreq.lifr_groupname);
+ if (pg == NULL) {
+ logerr("initifs: cannot create group "
+ "%s\n", lifreq.lifr_groupname);
+ continue;
+ }
+ phyint_group_insert(pg);
+ }
+ pg->pg_in_use = _B_TRUE;
+
+ /*
+ * Add this to the group's list of data addresses.
+ */
+ if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags,
+ &lifr->lifr_addr)) {
+ logerr("initifs: insufficient memory to track "
+ "data address information for %s\n",
+ lifr->lifr_name);
}
- taddr->next = laddr_list;
- laddr_list = taddr;
+ continue;
}
/*
- * Need to pass a phyint name to pii_process. Insert the
- * null where the ':' IF_SEPARATOR is found in the logical
- * name.
+ * This isn't an address on an IPMP meta-interface, so it's
+ * either on an underlying interface or not related to any
+ * group. Update our phyint and logint information (via
+ * pii_process() and logint_init_from_k()) -- but first,
+ * convert the logint name to a phyint name so we can call
+ * pii_process().
*/
(void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name));
if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
*cp = '\0';
- exists = pii_process(af, pi_name, &pii);
- if (exists) {
+ if (pii_process(af, pi_name, &pii)) {
/* The phyint is fine. So process the logint */
logint_init_from_k(pii, lifr->lifr_name);
check_addr_unique(pii, &lifr->lifr_addr);
}
-
}
-
free(buf);
/*
- * Scan for phyints and logints that have disappeared from the
+ * Scan for groups, phyints and logints that have disappeared from the
* kernel, and delete them.
*/
for (pii = phyint_instances; pii != NULL; pii = next_pii) {
@@ -449,70 +427,31 @@ initifs()
check_if_removed(pii);
}
+ for (pg = phyint_groups; pg != NULL; pg = next_pg) {
+ next_pg = pg->pg_next;
+ if (!pg->pg_in_use) {
+ phyint_group_delete(pg);
+ continue;
+ }
+ /*
+ * Refresh the group's state. This is necessary since the
+ * group's state is defined by the set of usable interfaces in
+ * the group, and an interface is considered unusable if all
+ * of its addresses are down. When an address goes down/up,
+ * the RTM_DELADDR/RTM_NEWADDR brings us through here.
+ */
+ phyint_group_refresh_state(pg);
+ }
+
/*
* Select a test address for sending probes on each phyint instance
*/
select_test_ifs();
/*
- * Handle link up/down notifications from the NICs.
+ * Handle link up/down notifications.
*/
process_link_state_changes();
-
- for (pi = phyints; pi != NULL; pi = pi->pi_next) {
- /*
- * If this is a case of group failure, we don't have much
- * to do until the group recovers again.
- */
- if (GROUP_FAILED(pi->pi_group))
- continue;
-
- /*
- * Try/Retry any pending failovers / failbacks, that did not
- * not complete, or that could not be initiated previously.
- * This implements the 3 invariants described in the big block
- * comment at the beginning of probe.c
- */
- if (pi->pi_flags & IFF_INACTIVE) {
- if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
- (void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
- } else {
- struct phyint_instance *pii;
-
- /*
- * Skip LINK UP interfaces which are not capable
- * of probing.
- */
- pii = pi->pi_v4;
- if (pii == NULL ||
- (LINK_UP(pi) && !PROBE_CAPABLE(pii))) {
- pii = pi->pi_v6;
- if (pii == NULL ||
- (LINK_UP(pi) && !PROBE_CAPABLE(pii)))
- continue;
- }
-
- /*
- * It is possible that the phyint has started
- * receiving packets, after it has been marked
- * PI_FAILED. Don't initiate failover, if the
- * phyint has started recovering. failure_state()
- * captures this check. A similar logic is used
- * for failback/repair case.
- */
- if (pi->pi_state == PI_FAILED && !pi->pi_empty &&
- (failure_state(pii) == PHYINT_FAILURE)) {
- (void) try_failover(pi, FAILOVER_NORMAL);
- } else if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
- if (try_failback(pi) != IPMP_FAILURE) {
- (void) change_lif_flags(pi, IFF_FAILED,
- _B_FALSE);
- /* Per state diagram */
- pi->pi_empty = 0;
- }
- }
- }
- }
}
/*
@@ -569,7 +508,7 @@ check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss)
* The probe socket is closed on each interface instance, and the
* interface state set to PI_OFFLINE.
*/
-static void
+void
stop_probing(struct phyint *pi)
{
struct phyint_instance *pii;
@@ -631,7 +570,6 @@ select_test_ifs(void)
struct logint *li;
struct logint *probe_logint;
boolean_t target_scan_reqd = _B_FALSE;
- struct target *tg;
int rating;
if (debug & D_PHYINT)
@@ -645,8 +583,8 @@ select_test_ifs(void)
probe_logint = NULL;
/*
- * An interface that is offline, should not be probed.
- * Offline interfaces should always in PI_OFFLINE state,
+ * An interface that is offline should not be probed.
+ * IFF_OFFLINE interfaces should always be PI_OFFLINE
* unless some other entity has set the offline flag.
*/
if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
@@ -659,6 +597,15 @@ select_test_ifs(void)
stop_probing(pii->pii_phyint);
}
continue;
+ } else {
+ /*
+ * If something cleared IFF_OFFLINE (e.g., by accident
+ * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is
+ * inherently racy), the phyint may still be offline.
+ * Just ignore it.
+ */
+ if (pii->pii_phyint->pi_state == PI_OFFLINE)
+ continue;
}
li = pii->pii_probe_logint;
@@ -776,17 +723,6 @@ select_test_ifs(void)
phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
}
- if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
- tg = pii->pii_targets;
- if (tg != NULL)
- target_delete(tg);
- assert(pii->pii_targets == NULL);
- assert(pii->pii_target_next == NULL);
- assert(pii->pii_ntargets == 0);
- target_create(pii, probe_logint->li_dstaddr,
- _B_TRUE);
- }
-
/*
* If no targets are currently known for this phyint
* we need to call init_router_targets. Since
@@ -806,15 +742,16 @@ select_test_ifs(void)
}
/*
- * Check the interface list for any interfaces that are marked
- * PI_FAILED but no longer enabled to send probes, and call
- * phyint_check_for_repair() to see if the link now indicates that the
- * interface should be repaired. Also see the state diagram in
+ * Scan the interface list for any interfaces that are PI_FAILED or
+ * PI_NOTARGETS but no longer enabled to send probes, and call
+ * phyint_check_for_repair() to see if the link state indicates that
+ * the interface should be repaired. Also see the state diagram in
* mpd_probe.c.
*/
for (pi = phyints; pi != NULL; pi = pi->pi_next) {
- if (pi->pi_state == PI_FAILED &&
- !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
+ if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) &&
+ (pi->pi_state == PI_FAILED ||
+ pi->pi_state == PI_NOTARGETS)) {
phyint_check_for_repair(pi);
}
}
@@ -875,15 +812,14 @@ check_testconfig(void)
pi->pi_v6->pii_probe_logint->li_dupaddr)
li = pi->pi_v6->pii_probe_logint;
- if (li != NULL) {
- if (!pi->pi_duptaddrmsg_printed) {
- (void) pr_addr(li->li_phyint_inst->pii_af,
- li->li_addr, abuf, sizeof (abuf));
- logerr("Test address %s is not unique in "
- "group; disabling probe-based failure "
- "detection on %s\n", abuf, pi->pi_name);
- pi->pi_duptaddrmsg_printed = 1;
- }
+ if (li != NULL && li->li_dupaddr) {
+ if (pi->pi_duptaddrmsg_printed)
+ continue;
+ logerr("Test address %s is not unique in group; "
+ "disabling probe-based failure detection on %s\n",
+ pr_addr(li->li_phyint_inst->pii_af,
+ li->li_addr, abuf, sizeof (abuf)), pi->pi_name);
+ pi->pi_duptaddrmsg_printed = 1;
continue;
}
@@ -915,10 +851,10 @@ check_config(void)
boolean_t v6_in_group;
/*
- * All phyints of a group must be homogenous to ensure that
- * failover or failback can be done. If any phyint in a group
- * has IPv4 plumbed, check that all phyints have IPv4 plumbed.
- * Do a similar check for IPv6.
+ * All phyints of a group must be homogeneous to ensure that they can
+ * take over for one another. If any phyint in a group has IPv4
+ * plumbed, check that all phyints have IPv4 plumbed. Do a similar
+ * check for IPv6.
*/
for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
if (pg == phyint_anongroup)
@@ -949,9 +885,9 @@ check_config(void)
if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
if (!pi->pi_cfgmsg_printed) {
- logerr("NIC %s of group %s is"
- " not plumbed for IPv4 and may"
- " affect failover capability\n",
+ logerr("IP interface %s in group %s is"
+ " not plumbed for IPv4, affecting"
+ " IPv4 connectivity\n",
pi->pi_name,
pi->pi_group->pg_name);
pi->pi_cfgmsg_printed = 1;
@@ -959,9 +895,9 @@ check_config(void)
} else if (v6_in_group == _B_TRUE &&
pi->pi_v6 == NULL) {
if (!pi->pi_cfgmsg_printed) {
- logerr("NIC %s of group %s is"
- " not plumbed for IPv6 and may"
- " affect failover capability\n",
+ logerr("IP interface %s in group %s is"
+ " not plumbed for IPv6, affecting"
+ " IPv6 connectivity\n",
pi->pi_name,
pi->pi_group->pg_name);
pi->pi_cfgmsg_printed = 1;
@@ -974,10 +910,10 @@ check_config(void)
* error recovery message
*/
if (pi->pi_cfgmsg_printed) {
- logerr("NIC %s is now consistent with "
- "group %s and failover capability "
- "is restored\n", pi->pi_name,
- pi->pi_group->pg_name);
+ logerr("IP interface %s is now"
+ " consistent with group %s "
+ " and connectivity is restored\n",
+ pi->pi_name, pi->pi_group->pg_name);
pi->pi_cfgmsg_printed = 0;
}
}
@@ -1117,8 +1053,8 @@ run_timeouts(void)
static int eventpipe_read = -1; /* Used for synchronous signal delivery */
static int eventpipe_write = -1;
-static boolean_t cleanup_started = _B_FALSE;
- /* Don't write to eventpipe if in cleanup */
+boolean_t cleanup_started = _B_FALSE; /* true if we're going away */
+
/*
* Ensure that signals are processed synchronously with the rest of
* the code by just writing a one character signal number on the pipe.
@@ -1228,7 +1164,7 @@ in_signal(int fd)
"Number of probes sent %lld\n"
"Number of probe acks received %lld\n"
"Number of probes/acks lost %lld\n"
- "Number of valid unacknowled probes %lld\n"
+ "Number of valid unacknowledged probes %lld\n"
"Number of ambiguous probe acks received %lld\n",
AF_STR(pii->pii_af), pii->pii_name,
sent, acked, lost, unacked, unknown);
@@ -1321,12 +1257,20 @@ setup_rtsock(int af)
{
int s;
int flags;
+ int aware = RTAW_UNDER_IPMP;
s = socket(PF_ROUTE, SOCK_RAW, af);
if (s == -1) {
logperror("setup_rtsock: socket PF_ROUTE");
exit(1);
}
+
+ if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) {
+ logperror("setup_rtsock: setsockopt RT_AWARE");
+ (void) close(s);
+ exit(1);
+ }
+
if ((flags = fcntl(s, F_GETFL, 0)) < 0) {
logperror("setup_rtsock: fcntl F_GETFL");
(void) close(s);
@@ -1347,8 +1291,7 @@ setup_rtsock(int af)
/*
* Process an RTM_IFINFO message received on a routing socket.
* The return value indicates whether a full interface scan is required.
- * Link up/down notifications from the NICs are reflected in the
- * IFF_RUNNING flag.
+ * Link up/down notifications are reflected in the IFF_RUNNING flag.
* If just the state of the IFF_RUNNING interface flag has changed, a
* a full interface scan isn't required.
*/
@@ -1400,7 +1343,7 @@ process_rtm_ifinfo(if_msghdr_t *ifm, int type)
/*
* We want to try and avoid doing a full interface scan for
- * link state notifications from the NICs, as indicated
+ * link state notifications from the datalink layer, as indicated
* by the state of the IFF_RUNNING flag. If just the
* IFF_RUNNING flag has changed state, the link state changes
* are processed without a full scan.
@@ -1441,25 +1384,7 @@ process_rtm_ifinfo(if_msghdr_t *ifm, int type)
* types.
*/
if ((old_flags ^ pii->pii_flags) & IFF_STANDBY)
- phyint_newtype(pi);
-
- /*
- * If IFF_INACTIVE has been set, then no data addresses should be
- * hosted on the interface. If IFF_INACTIVE has been cleared, then
- * move previously failed-over addresses back to it, provided it is
- * not failed. For details, see the state diagram in mpd_probe.c.
- */
- if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) {
- if (pii->pii_flags & IFF_INACTIVE) {
- if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
- (void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
- } else {
- if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
- pi->pi_empty = 0;
- (void) try_failback(pi);
- }
- }
- }
+ phyint_changed(pi);
/* Has just the IFF_RUNNING flag changed state ? */
if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
@@ -1620,22 +1545,24 @@ update_router_list(int fd)
t_scalar_t prim;
tor = (struct T_optmgmt_req *)&buf;
-
tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
tor->OPT_offset = sizeof (struct T_optmgmt_req);
tor->OPT_length = sizeof (struct opthdr);
tor->MGMT_flags = T_CURRENT;
+ /*
+ * Note: we use the special level value below so that IP will return
+ * us information concerning IRE_MARK_TESTHIDDEN routes.
+ */
req = (struct opthdr *)&tor[1];
- req->level = MIB2_IP; /* any MIB2_xxx value ok here */
+ req->level = EXPER_IP_AND_TESTHIDDEN;
req->name = 0;
req->len = 0;
ctlbuf.buf = (char *)&buf;
ctlbuf.len = tor->OPT_length + tor->OPT_offset;
ctlbuf.maxlen = sizeof (buf);
- flags = 0;
- if (putmsg(fd, &ctlbuf, NULL, flags) == -1) {
+ if (putmsg(fd, &ctlbuf, NULL, 0) == -1) {
logperror("update_router_list: putmsg(ctl)");
return (_B_FALSE);
}
@@ -1689,7 +1616,8 @@ update_router_list(int fd)
case T_OPTMGMT_ACK:
toa = &buf.uprim.optmgmt_ack;
optp = (struct opthdr *)&toa[1];
- if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) {
+ if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) +
+ sizeof (struct opthdr))) {
logerr("update_router_list: ctlbuf.len %d\n",
ctlbuf.len);
return (_B_FALSE);
@@ -1707,7 +1635,7 @@ update_router_list(int fd)
return (_B_FALSE);
}
- /* Process the T_OPGMGMT_ACK below */
+ /* Process the T_OPTMGMT_ACK below */
assert(prim == T_OPTMGMT_ACK);
switch (status) {
@@ -1717,9 +1645,8 @@ update_router_list(int fd)
* message. If this is the last message i.e EOD,
* return, else process the next T_OPTMGMT_ACK msg.
*/
- if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) +
- sizeof (struct opthdr)) && optp->len == 0 &&
- optp->name == 0 && optp->level == 0) {
+ if (optp->len == 0 && optp->name == 0 &&
+ optp->level == 0) {
/*
* This is the EOD message. Return
*/
@@ -1747,17 +1674,14 @@ update_router_list(int fd)
databuf.len = 0;
flags = 0;
for (;;) {
- status = getmsg(fd, NULL, &databuf, &flags);
- if (status >= 0) {
+ if (getmsg(fd, NULL, &databuf, &flags) >= 0)
break;
- } else if (errno == EINTR) {
+ if (errno == EINTR)
continue;
- } else {
- logperror("update_router_list:"
- " getmsg(data)");
- free(databuf.buf);
- return (_B_FALSE);
- }
+
+ logperror("update_router_list: getmsg(data)");
+ free(databuf.buf);
+ return (_B_FALSE);
}
if (optp->level == MIB2_IP &&
@@ -1777,18 +1701,35 @@ update_router_list(int fd)
/* NOTREACHED */
}
+
+/*
+ * Convert octet `octp' to a phyint name and store in `ifname'
+ */
+static void
+oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize)
+{
+ char *cp;
+ size_t len = MIN(octp->o_length, ifsize - 1);
+
+ (void) strncpy(ifname, octp->o_bytes, len);
+ ifname[len] = '\0';
+
+ if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL)
+ *cp = '\0';
+}
+
/*
- * Examine the IPv4 routing table, for default routers. For each default
- * router, populate the list of targets of each phyint that is on the same
- * link as the default router
+ * Examine the IPv4 routing table `buf' for possible targets. For each
+ * possible target, if it's on the same subnet an interface route, pass
+ * it to router_add_common() for further consideration.
*/
static void
ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
{
- mib2_ipRouteEntry_t *rp;
- mib2_ipRouteEntry_t *rp1;
- struct in_addr nexthop_v4;
- mib2_ipRouteEntry_t *endp;
+ char ifname[LIFNAMSIZ];
+ mib2_ipRouteEntry_t *rp, *rp1, *endp;
+ struct in_addr nexthop_v4;
+ struct in6_addr nexthop;
if (len == 0)
return;
@@ -1797,75 +1738,40 @@ ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
endp = buf + (len / sizeof (mib2_ipRouteEntry_t));
/*
- * Loop thru the routing table entries. Process any IRE_DEFAULT,
- * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
- * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
- * This is a potential target for probing, which we try to add
- * to the list of probe targets.
+ * Scan the routing table entries for any IRE_OFFSUBNET entries, and
+ * cross-reference them with the interface routes to determine if
+ * they're possible probe targets.
*/
for (rp = buf; rp < endp; rp++) {
if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
continue;
- /* Get the nexthop address. */
+ /* Get the nexthop address. */
nexthop_v4.s_addr = rp->ipRouteNextHop;
/*
- * Get the nexthop address. Then determine the outgoing
- * interface, by examining all interface IREs, and picking the
- * match. We don't look at the interface specified in the route
- * because we need to add the router target on all matching
- * interfaces anyway; the goal is to avoid falling back to
- * multicast when some interfaces are in the same subnet but
- * not in the same group.
+ * Rescan the routing table looking for interface routes that
+ * are on the same subnet, and try to add them. If they're
+ * not relevant (e.g., the interface route isn't part of an
+ * IPMP group, router_add_common() will discard).
*/
for (rp1 = buf; rp1 < endp; rp1++) {
- if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) {
+ if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) ||
+ rp1->ipRouteIfIndex.o_length == 0)
continue;
- }
- /*
- * Determine the interface IRE that matches the nexthop.
- * i.e. (IRE addr & IRE mask) == (nexthop & IRE mask)
- */
- if ((rp1->ipRouteDest & rp1->ipRouteMask) ==
- (nexthop_v4.s_addr & rp1->ipRouteMask)) {
- /*
- * We found the interface ire
- */
- router_add_v4(rp1, nexthop_v4);
- }
+ if ((rp1->ipRouteDest & rp1->ipRouteMask) !=
+ (nexthop_v4.s_addr & rp1->ipRouteMask))
+ continue;
+
+ oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ);
+ IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
+ router_add_common(AF_INET, ifname, nexthop);
}
}
}
void
-router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4)
-{
- char *cp;
- char ifname[LIFNAMSIZ + 1];
- struct in6_addr nexthop;
- int len;
-
- if (debug & D_TARGET)
- logdebug("router_add_v4()\n");
-
- len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1);
- (void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len);
- ifname[len] = '\0';
-
- if (ifname[0] == '\0')
- return;
-
- cp = strchr(ifname, IF_SEPARATOR);
- if (cp != NULL)
- *cp = '\0';
-
- IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
- router_add_common(AF_INET, ifname, nexthop);
-}
-
-void
router_add_common(int af, char *ifname, struct in6_addr nexthop)
{
struct phyint_instance *pii;
@@ -1906,16 +1812,17 @@ router_add_common(int af, char *ifname, struct in6_addr nexthop)
}
/*
- * Examine the IPv6 routing table, for default routers. For each default
- * router, populate the list of targets of each phyint that is on the same
- * link as the default router
+ * Examine the IPv6 routing table `buf' for possible link-local targets, and
+ * pass any contenders to router_add_common() for further consideration.
*/
static void
ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
{
- mib2_ipv6RouteEntry_t *rp;
- mib2_ipv6RouteEntry_t *endp;
- struct in6_addr nexthop_v6;
+ struct lifreq lifr;
+ char ifname[LIFNAMSIZ];
+ char grname[LIFGRNAMSIZ];
+ mib2_ipv6RouteEntry_t *rp, *rp1, *endp;
+ struct in6_addr nexthop_v6;
if (debug & D_TARGET)
logdebug("ire_process_v6(len %d)\n", len);
@@ -1927,62 +1834,51 @@ ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t));
/*
- * Loop thru the routing table entries. Process any IRE_DEFAULT,
- * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
- * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
- * This is a potential target for probing, which we try to add
- * to the list of probe targets.
+ * Scan the routing table entries for any IRE_OFFSUBNET entries, and
+ * cross-reference them with the interface routes to determine if
+ * they're possible probe targets.
*/
for (rp = buf; rp < endp; rp++) {
- if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET))
+ if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) ||
+ !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop))
continue;
- /*
- * We have the outgoing interface in ipv6RouteIfIndex
- * if ipv6RouteIfindex.o_length is non-zero. The outgoing
- * interface must be present for link-local addresses. Since
- * we use only link-local addreses for probing, we don't
- * consider the case when the outgoing interface is not
- * known and we need to scan interface ires
- */
+ /* Get the nexthop address. */
nexthop_v6 = rp->ipv6RouteNextHop;
- if (rp->ipv6RouteIfIndex.o_length != 0) {
- /*
- * We already have the outgoing interface
- * in ipv6RouteIfIndex.
- */
- router_add_v6(rp, nexthop_v6);
- }
- }
-}
-
-void
-router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6)
-{
- char ifname[LIFNAMSIZ + 1];
- char *cp;
- int len;
-
- if (debug & D_TARGET)
- logdebug("router_add_v6()\n");
-
- len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1);
- (void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len);
- ifname[len] = '\0';
+ /*
+ * The interface name should always exist for link-locals;
+ * we use it to map this entry to an IPMP group name.
+ */
+ if (rp->ipv6RouteIfIndex.o_length == 0)
+ continue;
- if (ifname[0] == '\0')
- return;
+ oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ);
+ if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 ||
+ strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) {
+ continue;
+ }
- cp = strchr(ifname, IF_SEPARATOR);
- if (cp != NULL)
- *cp = '\0';
+ /*
+ * Rescan the list of routes for interface routes, and add the
+ * above target to any interfaces in the same IPMP group.
+ */
+ for (rp1 = buf; rp1 < endp; rp1++) {
+ if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) ||
+ rp1->ipv6RouteIfIndex.o_length == 0) {
+ continue;
+ }
+ oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ);
+ (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ);
- router_add_common(AF_INET6, ifname, nexthop_v6);
+ if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 &&
+ strcmp(lifr.lifr_groupname, grname) == 0) {
+ router_add_common(AF_INET6, ifname, nexthop_v6);
+ }
+ }
+ }
}
-
-
/*
* Build a list of target routers, by scanning the routing tables.
* It is assumed that interface routes exist, to reach the routers.
@@ -2001,11 +1897,9 @@ init_router_targets(void)
for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
pi = pii->pii_phyint;
/*
- * Exclude ptp and host targets. Set tg_in_use to false,
- * only for router targets.
+ * Set tg_in_use to false only for router targets.
*/
- if (!pii->pii_targets_are_routers ||
- (pi->pi_flags & IFF_POINTOPOINT))
+ if (!pii->pii_targets_are_routers)
continue;
for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
@@ -2026,15 +1920,21 @@ init_router_targets(void)
}
for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
- if (!pii->pii_targets_are_routers ||
- (pi->pi_flags & IFF_POINTOPOINT))
+ pi = pii->pii_phyint;
+ if (!pii->pii_targets_are_routers)
continue;
for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
next_tg = tg->tg_next;
- if (!tg->tg_in_use) {
+ /*
+ * If the group has failed, it's likely the route was
+ * removed by an application affected by that failure.
+ * In that case, we keep the target so that we can
+ * reliably repair, at which point we'll refresh the
+ * target list again.
+ */
+ if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group))
target_delete(tg);
- }
}
}
}
@@ -2140,7 +2040,7 @@ getdefault(char *name)
* Command line options below
*/
boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */
-boolean_t track_all_phyints = _B_FALSE; /* option to track all NICs */
+boolean_t track_all_phyints = _B_FALSE; /* track all IP interfaces */
static boolean_t adopt = _B_FALSE;
static boolean_t foreground = _B_FALSE;
@@ -2149,6 +2049,7 @@ main(int argc, char *argv[])
{
int i;
int c;
+ struct phyint *pi;
struct phyint_instance *pii;
char *value;
@@ -2173,14 +2074,15 @@ main(int argc, char *argv[])
if (user_failure_detection_time <= 0) {
user_failure_detection_time = FAILURE_DETECTION_TIME;
logerr("Invalid failure detection time %s, assuming "
- "default %d\n", value, user_failure_detection_time);
+ "default of %d ms\n", value,
+ user_failure_detection_time);
} else if (user_failure_detection_time <
MIN_FAILURE_DETECTION_TIME) {
user_failure_detection_time =
MIN_FAILURE_DETECTION_TIME;
logerr("Too small failure detection time of %s, "
- "assuming minimum %d\n", value,
+ "assuming minimum of %d ms\n", value,
user_failure_detection_time);
}
free(value);
@@ -2211,9 +2113,9 @@ main(int argc, char *argv[])
*/
value = getdefault("FAILBACK");
if (value != NULL) {
- if (strncasecmp(value, "yes", 3) == 0)
+ if (strcasecmp(value, "yes") == 0)
failback_enabled = _B_TRUE;
- else if (strncasecmp(value, "no", 2) == 0)
+ else if (strcasecmp(value, "no") == 0)
failback_enabled = _B_FALSE;
else
logerr("Invalid value for FAILBACK %s\n", value);
@@ -2229,9 +2131,9 @@ main(int argc, char *argv[])
*/
value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
if (value != NULL) {
- if (strncasecmp(value, "yes", 3) == 0)
+ if (strcasecmp(value, "yes") == 0)
track_all_phyints = _B_FALSE;
- else if (strncasecmp(value, "no", 2) == 0)
+ else if (strcasecmp(value, "no") == 0)
track_all_phyints = _B_TRUE;
else
logerr("Invalid value for "
@@ -2340,12 +2242,6 @@ main(int argc, char *argv[])
initifs();
- /* Inform kernel whether failback is enabled or disabled */
- if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) {
- logperror("main: ioctl (SIOCSIPMPFAILBACK)");
- exit(1);
- }
-
/*
* If we're operating in "adopt" mode and no interfaces need to be
* tracked, shut down (ifconfig(1M) will restart us on demand if
@@ -2379,6 +2275,7 @@ main(int argc, char *argv[])
process_rtsock(rtsock_v4, rtsock_v6);
break;
}
+
for (pii = phyint_instances; pii != NULL;
pii = pii->pii_next) {
if (pollfds[i].fd == pii->pii_probe_sock) {
@@ -2389,15 +2286,21 @@ main(int argc, char *argv[])
break;
}
}
+
+ for (pi = phyints; pi != NULL; pi = pi->pi_next) {
+ if (pi->pi_notes != 0 &&
+ pollfds[i].fd == dlpi_fd(pi->pi_dh)) {
+ (void) dlpi_recv(pi->pi_dh, NULL, NULL,
+ NULL, NULL, 0, NULL);
+ break;
+ }
+ }
+
if (pollfds[i].fd == lsock_v4)
loopback_cmd(lsock_v4, AF_INET);
else if (pollfds[i].fd == lsock_v6)
loopback_cmd(lsock_v6, AF_INET6);
}
- if (full_scan_required) {
- initifs();
- full_scan_required = _B_FALSE;
- }
}
/* NOTREACHED */
return (EXIT_SUCCESS);
@@ -2481,29 +2384,23 @@ static struct {
{ "MI_PING", sizeof (uint32_t) },
{ "MI_OFFLINE", sizeof (mi_offline_t) },
{ "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) },
- { "MI_SETOINDEX", sizeof (mi_setoindex_t) },
{ "MI_QUERY", sizeof (mi_query_t) }
};
/*
- * Commands received over the loopback interface come here. Currently
- * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP
- * module. ifconfig only makes a connection, and closes it to check if
- * in.mpathd is running.
- * if_mpadm sends commands in the format specified by the mpathd_interface
- * structure.
+ * Commands received over the loopback interface come here (via libipmp).
*/
static void
loopback_cmd(int sock, int family)
{
int newfd;
ssize_t len;
+ boolean_t is_priv = _B_FALSE;
struct sockaddr_storage peer;
struct sockaddr_in *peer_sin;
struct sockaddr_in6 *peer_sin6;
socklen_t peerlen;
union mi_commands mpi;
- struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
char abuf[INET6_ADDRSTRLEN];
uint_t cmd;
int retval;
@@ -2528,10 +2425,11 @@ loopback_cmd(int sock, int family)
return;
}
peer_sin = (struct sockaddr_in *)&peer;
- if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) ||
- (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) {
- (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
- abuf, sizeof (abuf));
+ is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED;
+ (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
+ abuf, sizeof (abuf));
+
+ if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) {
logerr("Attempt to connect from addr %s port %d\n",
abuf, ntohs(peer_sin->sin_port));
(void) close(newfd);
@@ -2551,11 +2449,10 @@ loopback_cmd(int sock, int family)
* talking to us.
*/
peer_sin6 = (struct sockaddr_in6 *)&peer;
- if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) ||
- (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr,
- &loopback_addr))) {
- (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
- sizeof (abuf));
+ is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED;
+ (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
+ sizeof (abuf));
+ if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) {
logerr("Attempt to connect from addr %s port %d\n",
abuf, ntohs(peer_sin6->sin6_port));
(void) close(newfd);
@@ -2575,15 +2472,6 @@ loopback_cmd(int sock, int family)
len = read(newfd, &mpi, sizeof (mpi));
/*
- * ifconfig does not send any data. Just tests to see if mpathd
- * is already running.
- */
- if (len <= 0) {
- (void) close(newfd);
- return;
- }
-
- /*
* In theory, we can receive any sized message for a stream socket,
* but we don't expect that to happen for a small message over a
* loopback connection.
@@ -2591,6 +2479,8 @@ loopback_cmd(int sock, int family)
if (len < sizeof (uint32_t)) {
logerr("loopback_cmd: bad command format or read returns "
"partial data %d\n", len);
+ (void) close(newfd);
+ return;
}
cmd = mpi.mi_command;
@@ -2600,6 +2490,16 @@ loopback_cmd(int sock, int family)
return;
}
+ /*
+ * Only MI_PING and MI_QUERY can come from unprivileged sources.
+ */
+ if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) {
+ logerr("Unprivileged request from %s for privileged "
+ "command %s\n", abuf, commands[cmd].name);
+ (void) close(newfd);
+ return;
+ }
+
if (len < commands[cmd].size) {
logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
commands[cmd].name, commands[cmd].size, len);
@@ -2615,179 +2515,46 @@ loopback_cmd(int sock, int family)
(void) close(newfd);
}
-extern int global_errno; /* set by failover() or failback() */
-
/*
- * Process the offline, undo offline and set original index commands,
- * received from if_mpadm(1M)
+ * Process the commands received via libipmp.
*/
static unsigned int
process_cmd(int newfd, union mi_commands *mpi)
{
- uint_t nif = 0;
- uint32_t cmd;
struct phyint *pi;
- struct phyint *pi2;
- struct phyint_group *pg;
- boolean_t success;
- int error;
struct mi_offline *mio;
struct mi_undo_offline *miu;
- struct lifreq lifr;
- int ifsock;
- struct mi_setoindex *mis;
+ unsigned int retval;
- cmd = mpi->mi_command;
+ switch (mpi->mi_command) {
+ case MI_PING:
+ return (send_result(newfd, IPMP_SUCCESS, 0));
- switch (cmd) {
case MI_OFFLINE:
mio = &mpi->mi_ocmd;
- /*
- * Lookup the interface that needs to be offlined.
- * If it does not exist, return a suitable error.
- */
+
pi = phyint_lookup(mio->mio_ifname);
if (pi == NULL)
- return (send_result(newfd, IPMP_FAILURE, EINVAL));
-
- /*
- * Verify that the minimum redundancy requirements are met.
- * The multipathing group must have at least the specified
- * number of functional interfaces after offlining the
- * requested interface. Otherwise return a suitable error.
- */
- pg = pi->pi_group;
- nif = 0;
- if (pg != phyint_anongroup) {
- for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL;
- pi2 = pi2->pi_pgnext) {
- if ((pi2->pi_state == PI_RUNNING) ||
- (pg->pg_groupfailed &&
- !(pi2->pi_flags & IFF_OFFLINE)))
- nif++;
- }
- }
- if (nif < mio->mio_min_redundancy)
- return (send_result(newfd, IPMP_EMINRED, 0));
+ return (send_result(newfd, IPMP_EUNKIF, 0));
- /*
- * The order of operation is to set IFF_OFFLINE, followed by
- * failover. Setting IFF_OFFLINE ensures that no new ipif's
- * can be created. Subsequent failover moves everything on
- * the OFFLINE interface to some other functional interface.
- */
- success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE);
- if (success) {
- if (!pi->pi_empty) {
- error = try_failover(pi, FAILOVER_NORMAL);
- if (error != 0) {
- if (!change_lif_flags(pi, IFF_OFFLINE,
- _B_FALSE)) {
- logerr("process_cmd: couldn't"
- " clear OFFLINE flag on"
- " %s\n", pi->pi_name);
- /*
- * Offline interfaces should
- * not be probed.
- */
- stop_probing(pi);
- }
- return (send_result(newfd, error,
- global_errno));
- }
- }
- } else {
+ retval = phyint_offline(pi, mio->mio_min_redundancy);
+ if (retval == IPMP_FAILURE)
return (send_result(newfd, IPMP_FAILURE, errno));
- }
- /*
- * The interface is now Offline, so stop probing it.
- * Note that if_mpadm(1M) will down the test addresses,
- * after receiving a success reply from us. The routing
- * socket message will then make us close the socket used
- * for sending probes. But it is more logical that an
- * offlined interface must not be probed, even if it has
- * test addresses.
- */
- stop_probing(pi);
- return (send_result(newfd, IPMP_SUCCESS, 0));
+ return (send_result(newfd, retval, 0));
case MI_UNDO_OFFLINE:
miu = &mpi->mi_ucmd;
- /*
- * Undo the offline command. As usual lookup the interface.
- * Send an error if it does not exist or is not offline.
- */
- pi = phyint_lookup(miu->miu_ifname);
- if (pi == NULL || pi->pi_state != PI_OFFLINE)
- return (send_result(newfd, IPMP_FAILURE, EINVAL));
-
- /*
- * Reset the state of the interface based on the current link
- * state; if this phyint subsequently acquires a test address,
- * the state will be updated later as a result of the probes.
- */
- if (LINK_UP(pi))
- phyint_chstate(pi, PI_RUNNING);
- else
- phyint_chstate(pi, PI_FAILED);
-
- if (pi->pi_state == PI_RUNNING) {
- /*
- * Note that the success of MI_UNDO_OFFLINE is not
- * contingent on actually failing back; in the odd
- * case where we cannot do it here, we will try again
- * in initifs() since pi->pi_full will still be zero.
- */
- if (do_failback(pi) != IPMP_SUCCESS) {
- logdebug("process_cmd: cannot failback from "
- "%s during MI_UNDO_OFFLINE\n", pi->pi_name);
- }
- }
-
- /*
- * Clear the IFF_OFFLINE flag. We have to do this last
- * because do_failback() relies on it being set to decide
- * when to display messages.
- */
- (void) change_lif_flags(pi, IFF_OFFLINE, _B_FALSE);
-
- /*
- * Give the requestor time to configure test addresses
- * before complaining that they're missing.
- */
- pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
-
- return (send_result(newfd, IPMP_SUCCESS, 0));
-
- case MI_SETOINDEX:
- mis = &mpi->mi_scmd;
- /* Get the socket for doing ioctls */
- ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6;
-
- /*
- * Get index of new original interface.
- * The index is returned in lifr.lifr_index.
- */
- (void) strlcpy(lifr.lifr_name, mis->mis_new_pifname,
- sizeof (lifr.lifr_name));
+ pi = phyint_lookup(miu->miu_ifname);
+ if (pi == NULL)
+ return (send_result(newfd, IPMP_EUNKIF, 0));
- if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0)
+ retval = phyint_undo_offline(pi);
+ if (retval == IPMP_FAILURE)
return (send_result(newfd, IPMP_FAILURE, errno));
- /*
- * Set new original interface index.
- * The new index was put into lifr.lifr_index by the
- * SIOCGLIFINDEX ioctl.
- */
- (void) strlcpy(lifr.lifr_name, mis->mis_lifname,
- sizeof (lifr.lifr_name));
-
- if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0)
- return (send_result(newfd, IPMP_FAILURE, errno));
-
- return (send_result(newfd, IPMP_SUCCESS, 0));
+ return (send_result(newfd, retval, 0));
case MI_QUERY:
return (process_query(newfd, &mpi->mi_qcmd));
@@ -2806,6 +2573,8 @@ process_cmd(int newfd, union mi_commands *mpi)
static unsigned int
process_query(int fd, mi_query_t *miq)
{
+ ipmp_addrinfo_t *adinfop;
+ ipmp_addrinfolist_t *adlp;
ipmp_groupinfo_t *grinfop;
ipmp_groupinfolist_t *grlp;
ipmp_grouplist_t *grlistp;
@@ -2815,6 +2584,19 @@ process_query(int fd, mi_query_t *miq)
unsigned int retval;
switch (miq->miq_inforeq) {
+ case IPMP_ADDRINFO:
+ retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr,
+ &adinfop);
+ if (retval != IPMP_SUCCESS)
+ return (send_result(fd, retval, errno));
+
+ retval = send_result(fd, IPMP_SUCCESS, 0);
+ if (retval == IPMP_SUCCESS)
+ retval = send_addrinfo(fd, adinfop);
+
+ ipmp_freeaddrinfo(adinfop);
+ return (retval);
+
case IPMP_GROUPLIST:
retval = getgrouplist(&grlistp);
if (retval != IPMP_SUCCESS)
@@ -2829,7 +2611,7 @@ process_query(int fd, mi_query_t *miq)
case IPMP_GROUPINFO:
miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
- retval = getgroupinfo(miq->miq_ifname, &grinfop);
+ retval = getgroupinfo(miq->miq_grname, &grinfop);
if (retval != IPMP_SUCCESS)
return (send_result(fd, retval, errno));
@@ -2854,6 +2636,11 @@ process_query(int fd, mi_query_t *miq)
return (retval);
case IPMP_SNAP:
+ /*
+ * Before taking the snapshot, sync with the kernel.
+ */
+ initifs();
+
retval = getsnap(&snap);
if (retval != IPMP_SUCCESS)
return (send_result(fd, retval, errno));
@@ -2883,6 +2670,13 @@ process_query(int fd, mi_query_t *miq)
if (retval != IPMP_SUCCESS)
goto out;
}
+
+ adlp = snap->sn_adinfolistp;
+ for (; adlp != NULL; adlp = adlp->adl_next) {
+ retval = send_addrinfo(fd, adlp->adl_adinfop);
+ if (retval != IPMP_SUCCESS)
+ goto out;
+ }
out:
ipmp_snap_free(snap);
return (retval);
@@ -2902,14 +2696,20 @@ static unsigned int
send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
{
ipmp_iflist_t *iflistp = grinfop->gr_iflistp;
+ ipmp_addrlist_t *adlistp = grinfop->gr_adlistp;
unsigned int retval;
retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop);
if (retval != IPMP_SUCCESS)
return (retval);
- return (ipmp_writetlv(fd, IPMP_IFLIST,
- IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp));
+ retval = ipmp_writetlv(fd, IPMP_IFLIST,
+ IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp);
+ if (retval != IPMP_SUCCESS)
+ return (retval);
+
+ return (ipmp_writetlv(fd, IPMP_ADDRLIST,
+ IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp));
}
/*
@@ -2919,7 +2719,31 @@ send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
static unsigned int
send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
{
- return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop));
+ ipmp_addrlist_t *adlist4p = ifinfop->if_targinfo4.it_targlistp;
+ ipmp_addrlist_t *adlist6p = ifinfop->if_targinfo6.it_targlistp;
+ unsigned int retval;
+
+ retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop);
+ if (retval != IPMP_SUCCESS)
+ return (retval);
+
+ retval = ipmp_writetlv(fd, IPMP_ADDRLIST,
+ IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p);
+ if (retval != IPMP_SUCCESS)
+ return (retval);
+
+ return (ipmp_writetlv(fd, IPMP_ADDRLIST,
+ IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p));
+}
+
+/*
+ * Send the address information pointed to by `adinfop' on file descriptor
+ * `fd'. Returns an IPMP error code.
+ */
+static unsigned int
+send_addrinfo(int fd, ipmp_addrinfo_t *adinfop)
+{
+ return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop));
}
/*
@@ -3109,3 +2933,32 @@ close_probe_socket(struct phyint_instance *pii, boolean_t polled)
pii->pii_probe_sock = -1;
pii->pii_basetime_inited = 0;
}
+
+boolean_t
+addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags,
+ struct sockaddr_storage *ssp)
+{
+ addrlist_t *addrp;
+
+ if ((addrp = malloc(sizeof (addrlist_t))) == NULL)
+ return (_B_FALSE);
+
+ (void) strlcpy(addrp->al_name, name, LIFNAMSIZ);
+ addrp->al_flags = flags;
+ addrp->al_addr = *ssp;
+ addrp->al_next = *addrsp;
+ *addrsp = addrp;
+ return (_B_TRUE);
+}
+
+void
+addrlist_free(addrlist_t **addrsp)
+{
+ addrlist_t *addrp, *next_addrp;
+
+ for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) {
+ next_addrp = addrp->al_next;
+ free(addrp);
+ }
+ *addrsp = NULL;
+}