diff options
author | Joshua M. Clulow <jmc@joyent.com> | 2013-04-26 16:57:57 -0700 |
---|---|---|
committer | Joshua M. Clulow <jmc@joyent.com> | 2013-04-26 23:58:05 +0000 |
commit | 5c437167fe84c344bed577aa1c3307307b5b986d (patch) | |
tree | 236cc3ec29f855c9aa53947d23bb542ad525c845 | |
parent | 1dc8db5bfa6e0561ff1cd92c60d8d0b0c805476b (diff) | |
download | illumos-joyent-5c437167fe84c344bed577aa1c3307307b5b986d.tar.gz |
OS-2001 disk-monitor should activate fault/fail indicators
-rw-r--r-- | manifest | 2 | ||||
-rw-r--r-- | usr/src/cmd/fm/modules/common/disk-lights/Makefile | 23 | ||||
-rw-r--r-- | usr/src/cmd/fm/modules/common/disk-lights/disk-lights.conf | 17 | ||||
-rw-r--r-- | usr/src/cmd/fm/modules/common/disk-lights/disk_lights.c | 325 | ||||
-rw-r--r-- | usr/src/lib/fm/topo/libtopo/common/libtopo.h | 2 | ||||
-rw-r--r-- | usr/src/lib/fm/topo/libtopo/common/mapfile-vers | 1 | ||||
-rw-r--r-- | usr/src/lib/fm/topo/libtopo/common/topo_node.c | 52 |
7 files changed, 422 insertions, 0 deletions
@@ -5252,6 +5252,8 @@ f usr/lib/fm/fmd/ipmitopo 0555 root bin d usr/lib/fm/fmd/plugins 0755 root bin f usr/lib/fm/fmd/plugins/cpumem-retire.conf 0644 root bin f usr/lib/fm/fmd/plugins/cpumem-retire.so 0555 root bin +f usr/lib/fm/fmd/plugins/disk-lights.conf 0644 root bin +f usr/lib/fm/fmd/plugins/disk-lights.so 0555 root bin f usr/lib/fm/fmd/plugins/disk-monitor.conf 0644 root bin f usr/lib/fm/fmd/plugins/disk-monitor.so 0555 root bin f usr/lib/fm/fmd/plugins/disk-transport.conf 0644 root bin diff --git a/usr/src/cmd/fm/modules/common/disk-lights/Makefile b/usr/src/cmd/fm/modules/common/disk-lights/Makefile new file mode 100644 index 0000000000..d7c3bdadb7 --- /dev/null +++ b/usr/src/cmd/fm/modules/common/disk-lights/Makefile @@ -0,0 +1,23 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2013, Joyent, Inc. All rights reserved. +# + +MODULE = disk-lights +CLASS = common +SRCS = disk_lights.c + +include ../../Makefile.plugin + +LDLIBS += -ltopo +LDFLAGS += -L$(ROOT)/usr/lib/fm -R/usr/lib/fm diff --git a/usr/src/cmd/fm/modules/common/disk-lights/disk-lights.conf b/usr/src/cmd/fm/modules/common/disk-lights/disk-lights.conf new file mode 100644 index 0000000000..c0701dde08 --- /dev/null +++ b/usr/src/cmd/fm/modules/common/disk-lights/disk-lights.conf @@ -0,0 +1,17 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2013, Joyent, Inc. All rights reserved. +# + +subscribe fault.io.disk.* +subscribe fault.io.scsi.* diff --git a/usr/src/cmd/fm/modules/common/disk-lights/disk_lights.c b/usr/src/cmd/fm/modules/common/disk-lights/disk_lights.c new file mode 100644 index 0000000000..7eaa421559 --- /dev/null +++ b/usr/src/cmd/fm/modules/common/disk-lights/disk_lights.c @@ -0,0 +1,325 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ + +/* + * Disk Lights Agent (FMA) + * + * This Fault Management Daemon (fmd) module periodically scans the topology + * tree, enumerates all disks with associated fault indicators, and then + * synchronises the fault status of resources in the FMA Resource Cache with + * the indicators. In short: it turns the fault light on for befallen disks. + * + * Presently, we recognise associated fault indicators for disks by looking + * for the following structure in the topology tree: + * + * /bay=N + * | + * +---- /disk=0 <---------------- our Disk + * | + * +---- /bay=N?indicator=fail <---- the Fault Light + * \---- /bay=N?indicator=ident + * + * That is: a DISK node will have a parent BAY; that BAY will itself have + * child Facility nodes, one of which will be called "fail". If any of the + * above does not hold, we simply do nothing for this disk. + */ + +#include <string.h> +#include <strings.h> +#include <libnvpair.h> +#include <fm/libtopo.h> +#include <fm/topo_list.h> +#include <fm/topo_hc.h> +#include <fm/fmd_api.h> +#include <sys/fm/protocol.h> + + +typedef struct disk_lights { + fmd_hdl_t *dl_fmd; + uint64_t dl_poll_interval; + uint64_t dl_coalesce_interval; + id_t dl_timer; + boolean_t dl_triggered; +} disk_lights_t; + +static void disklights_topo(fmd_hdl_t *, topo_hdl_t *); +static void disklights_recv(fmd_hdl_t *, fmd_event_t *, nvlist_t *, + const char *); +static void disklights_timeout(fmd_hdl_t *, id_t, void *); + +static const fmd_hdl_ops_t fmd_ops = { + disklights_recv, /* fmdo_recv */ + disklights_timeout, /* fmdo_timeout */ + NULL, /* fmdo_close */ + NULL, /* fmdo_stats */ + NULL, /* fmdo_gc */ + NULL, /* fmdo_send */ + disklights_topo, /* fmdo_topo */ +}; + +/* + * POLL_INTERVAL is the period after which we perform an unsolicited poll + * to ensure we remain in sync with reality. + */ +#define DL_PROP_POLL_INTERVAL "poll-interval" + +/* + * COALESCE_INTERVAL is how long we wait after we are trigged by either a + * topology change or a relevant list.* event, in order to allow a series + * of events to coalesce. + */ +#define DL_PROP_COALESCE_INTERVAL "coalesce-interval" + +static const fmd_prop_t fmd_props[] = { + { DL_PROP_POLL_INTERVAL, FMD_TYPE_TIME, "5min" }, + { DL_PROP_COALESCE_INTERVAL, FMD_TYPE_TIME, "3s" }, + { NULL, 0, NULL } +}; + +static const fmd_hdl_info_t fmd_info = { + "Disk Lights Agent", + "1.0", + &fmd_ops, + fmd_props +}; + +/* + * Fetch the Facility Node properties (name, type) from the FMRI + * for this node, or return -1 if we can't. + */ +static int +get_facility_props(topo_hdl_t *hdl, tnode_t *node, char **facname, + char **factype) +{ + int e, ret = -1; + nvlist_t *fmri = NULL, *fnvl; + char *nn = NULL, *tt = NULL; + + if (topo_node_resource(node, &fmri, &e) != 0) + goto out; + + if (nvlist_lookup_nvlist(fmri, FM_FMRI_FACILITY, &fnvl) != 0) + goto out; + + if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_NAME, &nn) != 0) + goto out; + + if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_TYPE, &tt) != 0) + goto out; + + *facname = topo_hdl_strdup(hdl, nn); + *factype = topo_hdl_strdup(hdl, tt); + ret = 0; + +out: + nvlist_free(fmri); + return (ret); +} + +typedef struct dl_fault_walk_inner { + char *fwi_name; + uint32_t fwi_mode; +} dl_fault_walk_inner_t; + +static int +dl_fault_walk_inner(topo_hdl_t *thp, tnode_t *node, void *arg) +{ + dl_fault_walk_inner_t *fwi = arg; + char *facname = NULL, *factype = NULL; + int err; + + /* + * We're only interested in BAY children that are valid Facility Nodes. + */ + if (topo_node_flags(node) != TOPO_NODE_FACILITY || + get_facility_props(thp, node, &facname, &factype) != 0) { + goto out; + } + + if (strcmp(fwi->fwi_name, facname) != 0) + goto out; + + /* + * Attempt to set the LED mode appropriately. If this fails, give up + * and move on. + */ + (void) topo_prop_set_uint32(node, TOPO_PGROUP_FACILITY, TOPO_LED_MODE, + TOPO_PROP_MUTABLE, fwi->fwi_mode, &err); + +out: + topo_hdl_strfree(thp, facname); + topo_hdl_strfree(thp, factype); + return (TOPO_WALK_NEXT); +} + +static int +dl_fault_walk_outer(topo_hdl_t *thp, tnode_t *node, void *arg) +{ + disk_lights_t *dl = arg; + dl_fault_walk_inner_t fwi; + tnode_t *pnode; + int err, has_fault; + nvlist_t *fmri = NULL; + + bzero(&fwi, sizeof (fwi)); + + /* + * We are only looking for DISK nodes in the topology that have a parent + * BAY. + */ + if (strcmp(DISK, topo_node_name(node)) != 0 || + (pnode = topo_node_parent(node)) == NULL || + strcmp(BAY, topo_node_name(pnode)) != 0) { + return (TOPO_WALK_NEXT); + } + + /* + * Check to see if the Resource this FMRI describes is Faulty: + */ + if (topo_node_resource(node, &fmri, &err) != 0) + return (TOPO_WALK_NEXT); + has_fault = fmd_nvl_fmri_has_fault(dl->dl_fmd, fmri, + FMD_HAS_FAULT_RESOURCE, NULL); + nvlist_free(fmri); + + /* + * Walk the children of this BAY and flush out our fault status if + * we find an appropriate indicator node. + */ + fwi.fwi_name = "fail"; + fwi.fwi_mode = has_fault ? TOPO_LED_STATE_ON : TOPO_LED_STATE_OFF; + (void) topo_node_child_walk(thp, pnode, dl_fault_walk_inner, &fwi, + &err); + + return (TOPO_WALK_NEXT); +} + +/* + * Walk all of the topology nodes looking for DISKs that match the structure + * described in the overview. Once we find them, check their fault status + * and update their fault indiciator accordingly. + */ +static void +dl_examine_topo(disk_lights_t *dl) +{ + int err; + topo_hdl_t *thp = NULL; + topo_walk_t *twp = NULL; + + thp = fmd_hdl_topo_hold(dl->dl_fmd, TOPO_VERSION); + if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, dl_fault_walk_outer, + dl, &err)) == NULL) { + fmd_hdl_error(dl->dl_fmd, "failed to get topology: %s\n", + topo_strerror(err)); + goto out; + } + + if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR) { + fmd_hdl_error(dl->dl_fmd, "failed to walk topology: %s\n", + topo_strerror(err)); + goto out; + } + +out: + if (twp != NULL) + topo_walk_fini(twp); + if (thp != NULL) + fmd_hdl_topo_rele(dl->dl_fmd, thp); +} + +static void +dl_trigger_enum(disk_lights_t *dl) +{ + /* + * If we're already on the short-poll coalesce timer, then return + * immediately. + */ + if (dl->dl_triggered == B_TRUE) + return; + dl->dl_triggered = B_TRUE; + + /* + * Replace existing poll timer with coalesce timer: + */ + if (dl->dl_timer != 0) + fmd_timer_remove(dl->dl_fmd, dl->dl_timer); + dl->dl_timer = fmd_timer_install(dl->dl_fmd, NULL, NULL, + dl->dl_coalesce_interval); +} + +static void +disklights_timeout(fmd_hdl_t *hdl, id_t id, void *data) +{ + disk_lights_t *dl = fmd_hdl_getspecific(hdl); + + dl->dl_triggered = B_FALSE; + + dl_examine_topo(dl); + + /* + * Install the long-interval timer for the next poll. + */ + dl->dl_timer = fmd_timer_install(hdl, NULL, NULL, dl->dl_poll_interval); +} + +static void +disklights_topo(fmd_hdl_t *hdl, topo_hdl_t *thp) +{ + disk_lights_t *dl = fmd_hdl_getspecific(hdl); + + dl_trigger_enum(dl); +} + +static void +disklights_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, + const char *class) +{ + disk_lights_t *dl = fmd_hdl_getspecific(hdl); + + dl_trigger_enum(dl); +} + +void +_fmd_init(fmd_hdl_t *hdl) +{ + disk_lights_t *dl; + + if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) + return; + + dl = fmd_hdl_zalloc(hdl, sizeof (*dl), FMD_SLEEP); + fmd_hdl_setspecific(hdl, dl); + + /* + * Load Configuration: + */ + dl->dl_fmd = hdl; + dl->dl_poll_interval = fmd_prop_get_int64(hdl, DL_PROP_POLL_INTERVAL); + dl->dl_coalesce_interval = fmd_prop_get_int64(hdl, + DL_PROP_COALESCE_INTERVAL); + + /* + * Schedule the initial enumeration: + */ + dl_trigger_enum(dl); +} + +void +_fmd_fini(fmd_hdl_t *hdl) +{ + disk_lights_t *dl = fmd_hdl_getspecific(hdl); + + fmd_hdl_free(hdl, dl, sizeof (*dl)); +} diff --git a/usr/src/lib/fm/topo/libtopo/common/libtopo.h b/usr/src/lib/fm/topo/libtopo/common/libtopo.h index 9b7102f882..3ea35cdddd 100644 --- a/usr/src/lib/fm/topo/libtopo/common/libtopo.h +++ b/usr/src/lib/fm/topo/libtopo/common/libtopo.h @@ -137,6 +137,8 @@ extern boolean_t topo_method_supported(tnode_t *, const char *, topo_version_t); extern int topo_node_facility(topo_hdl_t *, tnode_t *, const char *, uint32_t, topo_faclist_t *, int *); +extern int topo_node_child_walk(topo_hdl_t *, tnode_t *, topo_walk_cb_t, + void *, int *); /* * Node flags: denotes type of node diff --git a/usr/src/lib/fm/topo/libtopo/common/mapfile-vers b/usr/src/lib/fm/topo/libtopo/common/mapfile-vers index 1cde0dee1f..b81f4fd7c6 100644 --- a/usr/src/lib/fm/topo/libtopo/common/mapfile-vers +++ b/usr/src/lib/fm/topo/libtopo/common/mapfile-vers @@ -126,6 +126,7 @@ SYMBOL_VERSION SUNWprivate { topo_node_asru; topo_node_asru_set; topo_node_bind; + topo_node_child_walk; topo_node_facility; topo_node_facbind; topo_node_flags; diff --git a/usr/src/lib/fm/topo/libtopo/common/topo_node.c b/usr/src/lib/fm/topo/libtopo/common/topo_node.c index 553bc851f0..00b828c49c 100644 --- a/usr/src/lib/fm/topo/libtopo/common/topo_node.c +++ b/usr/src/lib/fm/topo/libtopo/common/topo_node.c @@ -837,3 +837,55 @@ topo_node_walk_init(topo_hdl_t *thp, topo_mod_t *mod, tnode_t *node, return (wp); } + +/* + * Walk the direct children of the given node. + */ +int +topo_node_child_walk(topo_hdl_t *thp, tnode_t *pnode, topo_walk_cb_t cb_f, + void *arg, int *errp) +{ + int ret = TOPO_WALK_TERMINATE; + tnode_t *cnode; + + topo_node_hold(pnode); + + /* + * First Child: + */ + topo_node_lock(pnode); + cnode = topo_child_first(pnode); + topo_node_unlock(pnode); + + if (cnode == NULL) { + *errp = ETOPO_WALK_EMPTY; + ret = TOPO_WALK_ERR; + goto out; + } + + while (cnode != NULL) { + int iret; + + /* + * Call the walker callback: + */ + topo_node_hold(cnode); + iret = cb_f(thp, cnode, arg); + topo_node_rele(cnode); + if (iret != TOPO_WALK_NEXT) { + ret = iret; + break; + } + + /* + * Next child: + */ + topo_node_lock(pnode); + cnode = topo_child_next(pnode, cnode); + topo_node_unlock(pnode); + } + +out: + topo_node_rele(pnode); + return (ret); +} |