summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoshua M. Clulow <jmc@joyent.com>2013-04-26 16:57:57 -0700
committerJoshua M. Clulow <jmc@joyent.com>2013-04-26 23:58:05 +0000
commit5c437167fe84c344bed577aa1c3307307b5b986d (patch)
tree236cc3ec29f855c9aa53947d23bb542ad525c845
parent1dc8db5bfa6e0561ff1cd92c60d8d0b0c805476b (diff)
downloadillumos-joyent-5c437167fe84c344bed577aa1c3307307b5b986d.tar.gz
OS-2001 disk-monitor should activate fault/fail indicators
-rw-r--r--manifest2
-rw-r--r--usr/src/cmd/fm/modules/common/disk-lights/Makefile23
-rw-r--r--usr/src/cmd/fm/modules/common/disk-lights/disk-lights.conf17
-rw-r--r--usr/src/cmd/fm/modules/common/disk-lights/disk_lights.c325
-rw-r--r--usr/src/lib/fm/topo/libtopo/common/libtopo.h2
-rw-r--r--usr/src/lib/fm/topo/libtopo/common/mapfile-vers1
-rw-r--r--usr/src/lib/fm/topo/libtopo/common/topo_node.c52
7 files changed, 422 insertions, 0 deletions
diff --git a/manifest b/manifest
index 9e45052588..0c351a8d87 100644
--- a/manifest
+++ b/manifest
@@ -5252,6 +5252,8 @@ f usr/lib/fm/fmd/ipmitopo 0555 root bin
d usr/lib/fm/fmd/plugins 0755 root bin
f usr/lib/fm/fmd/plugins/cpumem-retire.conf 0644 root bin
f usr/lib/fm/fmd/plugins/cpumem-retire.so 0555 root bin
+f usr/lib/fm/fmd/plugins/disk-lights.conf 0644 root bin
+f usr/lib/fm/fmd/plugins/disk-lights.so 0555 root bin
f usr/lib/fm/fmd/plugins/disk-monitor.conf 0644 root bin
f usr/lib/fm/fmd/plugins/disk-monitor.so 0555 root bin
f usr/lib/fm/fmd/plugins/disk-transport.conf 0644 root bin
diff --git a/usr/src/cmd/fm/modules/common/disk-lights/Makefile b/usr/src/cmd/fm/modules/common/disk-lights/Makefile
new file mode 100644
index 0000000000..d7c3bdadb7
--- /dev/null
+++ b/usr/src/cmd/fm/modules/common/disk-lights/Makefile
@@ -0,0 +1,23 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2013, Joyent, Inc. All rights reserved.
+#
+
+MODULE = disk-lights
+CLASS = common
+SRCS = disk_lights.c
+
+include ../../Makefile.plugin
+
+LDLIBS += -ltopo
+LDFLAGS += -L$(ROOT)/usr/lib/fm -R/usr/lib/fm
diff --git a/usr/src/cmd/fm/modules/common/disk-lights/disk-lights.conf b/usr/src/cmd/fm/modules/common/disk-lights/disk-lights.conf
new file mode 100644
index 0000000000..c0701dde08
--- /dev/null
+++ b/usr/src/cmd/fm/modules/common/disk-lights/disk-lights.conf
@@ -0,0 +1,17 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2013, Joyent, Inc. All rights reserved.
+#
+
+subscribe fault.io.disk.*
+subscribe fault.io.scsi.*
diff --git a/usr/src/cmd/fm/modules/common/disk-lights/disk_lights.c b/usr/src/cmd/fm/modules/common/disk-lights/disk_lights.c
new file mode 100644
index 0000000000..7eaa421559
--- /dev/null
+++ b/usr/src/cmd/fm/modules/common/disk-lights/disk_lights.c
@@ -0,0 +1,325 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Disk Lights Agent (FMA)
+ *
+ * This Fault Management Daemon (fmd) module periodically scans the topology
+ * tree, enumerates all disks with associated fault indicators, and then
+ * synchronises the fault status of resources in the FMA Resource Cache with
+ * the indicators. In short: it turns the fault light on for befallen disks.
+ *
+ * Presently, we recognise associated fault indicators for disks by looking
+ * for the following structure in the topology tree:
+ *
+ * /bay=N
+ * |
+ * +---- /disk=0 <---------------- our Disk
+ * |
+ * +---- /bay=N?indicator=fail <---- the Fault Light
+ * \---- /bay=N?indicator=ident
+ *
+ * That is: a DISK node will have a parent BAY; that BAY will itself have
+ * child Facility nodes, one of which will be called "fail". If any of the
+ * above does not hold, we simply do nothing for this disk.
+ */
+
+#include <string.h>
+#include <strings.h>
+#include <libnvpair.h>
+#include <fm/libtopo.h>
+#include <fm/topo_list.h>
+#include <fm/topo_hc.h>
+#include <fm/fmd_api.h>
+#include <sys/fm/protocol.h>
+
+
+/*
+ * Per-module state for the Disk Lights agent.  One instance is allocated
+ * in _fmd_init(), attached to the fmd handle with fmd_hdl_setspecific(),
+ * and released in _fmd_fini().
+ */
+typedef struct disk_lights {
+ /* The fmd module handle this state belongs to. */
+ fmd_hdl_t *dl_fmd;
+ /* Value of the "poll-interval" property; delay before the next poll. */
+ uint64_t dl_poll_interval;
+ /* Value of the "coalesce-interval" property; delay after a trigger. */
+ uint64_t dl_coalesce_interval;
+ /* Id of the most recently installed timer; 0 until one is installed. */
+ id_t dl_timer;
+ /* B_TRUE from a trigger until the next timeout callback runs. */
+ boolean_t dl_triggered;
+} disk_lights_t;
+
+/*
+ * Forward declarations of the fmd entry points referenced by the fmd_ops
+ * table below.
+ */
+static void disklights_topo(fmd_hdl_t *, topo_hdl_t *);
+static void disklights_recv(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
+ const char *);
+static void disklights_timeout(fmd_hdl_t *, id_t, void *);
+
+/*
+ * fmd entry point table.  This module only implements the event receive,
+ * timer expiry and topology change callbacks; every other slot is NULL.
+ */
+static const fmd_hdl_ops_t fmd_ops = {
+ disklights_recv, /* fmdo_recv */
+ disklights_timeout, /* fmdo_timeout */
+ NULL, /* fmdo_close */
+ NULL, /* fmdo_stats */
+ NULL, /* fmdo_gc */
+ NULL, /* fmdo_send */
+ disklights_topo, /* fmdo_topo */
+};
+
+/*
+ * POLL_INTERVAL is the period after which we perform an unsolicited poll
+ * to ensure we remain in sync with reality.
+ */
+#define DL_PROP_POLL_INTERVAL "poll-interval"
+
+/*
+ * COALESCE_INTERVAL is how long we wait after we are triggered by either a
+ * topology change or a relevant list.* event, in order to allow a series
+ * of events to coalesce.
+ */
+#define DL_PROP_COALESCE_INTERVAL "coalesce-interval"
+
+/*
+ * Module properties and their default values: poll every five minutes, and
+ * wait three seconds after a trigger before examining the topology.  The
+ * table is terminated by an all-NULL entry.
+ */
+static const fmd_prop_t fmd_props[] = {
+ { DL_PROP_POLL_INTERVAL, FMD_TYPE_TIME, "5min" },
+ { DL_PROP_COALESCE_INTERVAL, FMD_TYPE_TIME, "3s" },
+ { NULL, 0, NULL }
+};
+
+/*
+ * Module registration record passed to fmd_hdl_register() in _fmd_init():
+ * a description string, a version string, the entry point table and the
+ * property table defined above.
+ */
+static const fmd_hdl_info_t fmd_info = {
+ "Disk Lights Agent",
+ "1.0",
+ &fmd_ops,
+ fmd_props
+};
+
+/*
+ * Fetch the Facility Node properties (name, type) from the FMRI for this
+ * node.
+ *
+ * On success, returns 0 and stores newly duplicated strings in *facname and
+ * *factype; the caller must release both with topo_hdl_strfree().  On any
+ * failure -- including a failure to duplicate either string -- returns -1
+ * and leaves *facname and *factype untouched.
+ */
+static int
+get_facility_props(topo_hdl_t *hdl, tnode_t *node, char **facname,
+    char **factype)
+{
+	int e, ret = -1;
+	nvlist_t *fmri = NULL, *fnvl;
+	char *nn = NULL, *tt = NULL;
+	char *ndup = NULL, *tdup = NULL;
+
+	if (topo_node_resource(node, &fmri, &e) != 0)
+		goto out;
+
+	if (nvlist_lookup_nvlist(fmri, FM_FMRI_FACILITY, &fnvl) != 0)
+		goto out;
+
+	if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_NAME, &nn) != 0)
+		goto out;
+
+	if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_TYPE, &tt) != 0)
+		goto out;
+
+	/*
+	 * The fmri is freed below, so the caller is handed copies.  Either
+	 * copy may fail; never report success with a NULL string, because
+	 * the caller passes the name straight to strcmp().
+	 */
+	ndup = topo_hdl_strdup(hdl, nn);
+	tdup = topo_hdl_strdup(hdl, tt);
+	if (ndup == NULL || tdup == NULL) {
+		if (ndup != NULL)
+			topo_hdl_strfree(hdl, ndup);
+		if (tdup != NULL)
+			topo_hdl_strfree(hdl, tdup);
+		goto out;
+	}
+
+	*facname = ndup;
+	*factype = tdup;
+	ret = 0;
+
+out:
+	nvlist_free(fmri);
+	return (ret);
+}
+
+/*
+ * Argument handed to dl_fault_walk_inner() for each child of a BAY node.
+ */
+typedef struct dl_fault_walk_inner {
+ /* Facility name to match against each child (e.g. "fail"). */
+ char *fwi_name;
+ /* LED mode value to write to the matching facility node. */
+ uint32_t fwi_mode;
+} dl_fault_walk_inner_t;
+
+/*
+ * Child-walk callback: invoked for each child of a BAY node.  If the child
+ * is a Facility Node whose facility name matches the one requested in the
+ * dl_fault_walk_inner_t argument, attempt to set its LED mode.  Always
+ * returns TOPO_WALK_NEXT so the walk visits every child.
+ */
+static int
+dl_fault_walk_inner(topo_hdl_t *thp, tnode_t *node, void *arg)
+{
+	dl_fault_walk_inner_t *want = arg;
+	char *name = NULL, *type = NULL;
+	int err;
+
+	/*
+	 * Only valid Facility Nodes with the requested name are of interest.
+	 * A failure to set the LED mode is deliberately ignored: we give up
+	 * on this indicator and move on.
+	 */
+	if (topo_node_flags(node) == TOPO_NODE_FACILITY &&
+	    get_facility_props(thp, node, &name, &type) == 0 &&
+	    strcmp(want->fwi_name, name) == 0) {
+		(void) topo_prop_set_uint32(node, TOPO_PGROUP_FACILITY,
+		    TOPO_LED_MODE, TOPO_PROP_MUTABLE, want->fwi_mode, &err);
+	}
+
+	topo_hdl_strfree(thp, name);
+	topo_hdl_strfree(thp, type);
+	return (TOPO_WALK_NEXT);
+}
+
+/*
+ * Topology walk callback: for every DISK node whose parent is a BAY, look
+ * up whether the disk's resource FMRI has a fault, then walk the BAY's
+ * children to switch the "fail" indicator on or off accordingly.  Always
+ * returns TOPO_WALK_NEXT.
+ */
+static int
+dl_fault_walk_outer(topo_hdl_t *thp, tnode_t *node, void *arg)
+{
+	disk_lights_t *dl = arg;
+	dl_fault_walk_inner_t fwi;
+	tnode_t *bay;
+	nvlist_t *rsrc = NULL;
+	int err, faulty;
+
+	bzero(&fwi, sizeof (fwi));
+
+	/*
+	 * Skip anything that is not a DISK ...
+	 */
+	if (strcmp(DISK, topo_node_name(node)) != 0)
+		return (TOPO_WALK_NEXT);
+
+	/*
+	 * ... or that does not sit directly beneath a BAY.
+	 */
+	bay = topo_node_parent(node);
+	if (bay == NULL || strcmp(BAY, topo_node_name(bay)) != 0)
+		return (TOPO_WALK_NEXT);
+
+	/*
+	 * Ask fmd whether the Resource this FMRI describes is Faulty.
+	 */
+	if (topo_node_resource(node, &rsrc, &err) != 0)
+		return (TOPO_WALK_NEXT);
+	faulty = fmd_nvl_fmri_has_fault(dl->dl_fmd, rsrc,
+	    FMD_HAS_FAULT_RESOURCE, NULL);
+	nvlist_free(rsrc);
+
+	/*
+	 * Push the fault status out to the BAY's "fail" indicator, if it
+	 * has one.
+	 */
+	fwi.fwi_name = "fail";
+	if (faulty)
+		fwi.fwi_mode = TOPO_LED_STATE_ON;
+	else
+		fwi.fwi_mode = TOPO_LED_STATE_OFF;
+	(void) topo_node_child_walk(thp, bay, dl_fault_walk_inner, &fwi,
+	    &err);
+
+	return (TOPO_WALK_NEXT);
+}
+
+/*
+ * Walk all of the topology nodes looking for DISKs that match the structure
+ * described in the overview. Once we find them, check their fault status
+ * and update their fault indicator accordingly.
+ *
+ * Failures are reported through fmd_hdl_error(); the topology hold and the
+ * walker are always released before returning.
+ */
+static void
+dl_examine_topo(disk_lights_t *dl)
+{
+	int err = 0;
+	topo_hdl_t *thp = NULL;
+	topo_walk_t *twp = NULL;
+
+	thp = fmd_hdl_topo_hold(dl->dl_fmd, TOPO_VERSION);
+	if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, dl_fault_walk_outer,
+	    dl, &err)) == NULL) {
+		fmd_hdl_error(dl->dl_fmd, "failed to get topology: %s\n",
+		    topo_strerror(err));
+		goto out;
+	}
+
+	/*
+	 * topo_walk_step() is not given &err, so there is no error code to
+	 * decode here; report the failure without one rather than printing
+	 * a stale or uninitialised value.
+	 */
+	if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR)
+		fmd_hdl_error(dl->dl_fmd, "failed to walk topology\n");
+
+out:
+	if (twp != NULL)
+		topo_walk_fini(twp);
+	if (thp != NULL)
+		fmd_hdl_topo_rele(dl->dl_fmd, thp);
+}
+
+/*
+ * Request an examination of the topology after the coalesce interval.  If
+ * a request is already pending this is a no-op, so a burst of triggers
+ * results in a single examination.
+ */
+static void
+dl_trigger_enum(disk_lights_t *dl)
+{
+	fmd_hdl_t *hdl = dl->dl_fmd;
+
+	if (dl->dl_triggered != B_TRUE) {
+		dl->dl_triggered = B_TRUE;
+
+		/*
+		 * Swap whatever timer is outstanding (normally the long
+		 * poll timer) for the short coalesce timer.
+		 */
+		if (dl->dl_timer != 0)
+			fmd_timer_remove(hdl, dl->dl_timer);
+		dl->dl_timer = fmd_timer_install(hdl, NULL, NULL,
+		    dl->dl_coalesce_interval);
+	}
+}
+
+/*
+ * fmdo_timeout entry point: a poll or coalesce timer has fired.  Clear the
+ * pending-trigger flag, synchronise the indicators with the current fault
+ * state, and schedule the next unsolicited poll.
+ */
+static void
+disklights_timeout(fmd_hdl_t *hdl, id_t id, void *data)
+{
+	disk_lights_t *state = fmd_hdl_getspecific(hdl);
+
+	state->dl_triggered = B_FALSE;
+	dl_examine_topo(state);
+
+	/*
+	 * Fall back to the long-interval timer until something triggers us.
+	 */
+	state->dl_timer = fmd_timer_install(hdl, NULL, NULL,
+	    state->dl_poll_interval);
+}
+
+/*
+ * fmdo_topo entry point: the topology has changed, so schedule a
+ * (coalesced) re-examination of the indicators.
+ */
+static void
+disklights_topo(fmd_hdl_t *hdl, topo_hdl_t *thp)
+{
+	dl_trigger_enum(fmd_hdl_getspecific(hdl));
+}
+
+/*
+ * fmdo_recv entry point: a subscribed event has arrived.  The event itself
+ * is not inspected; it only serves to schedule a (coalesced)
+ * re-examination of the indicators.
+ */
+static void
+disklights_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
+    const char *class)
+{
+	dl_trigger_enum(fmd_hdl_getspecific(hdl));
+}
+
+/*
+ * Module load entry point: register with fmd, allocate the per-module
+ * state, read the configured intervals and schedule the first examination
+ * of the topology.  Does nothing further if registration fails.
+ */
+void
+_fmd_init(fmd_hdl_t *hdl)
+{
+	disk_lights_t *state;
+
+	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
+		return;
+
+	state = fmd_hdl_zalloc(hdl, sizeof (disk_lights_t), FMD_SLEEP);
+	fmd_hdl_setspecific(hdl, state);
+
+	/*
+	 * Pull the intervals from the module properties.
+	 */
+	state->dl_fmd = hdl;
+	state->dl_poll_interval =
+	    fmd_prop_get_int64(hdl, DL_PROP_POLL_INTERVAL);
+	state->dl_coalesce_interval =
+	    fmd_prop_get_int64(hdl, DL_PROP_COALESCE_INTERVAL);
+
+	/*
+	 * Kick off the first enumeration via the coalesce timer.
+	 */
+	dl_trigger_enum(state);
+}
+
+/*
+ * Module unload entry point: release the per-module state allocated in
+ * _fmd_init().
+ */
+void
+_fmd_fini(fmd_hdl_t *hdl)
+{
+	disk_lights_t *state = fmd_hdl_getspecific(hdl);
+
+	fmd_hdl_free(hdl, state, sizeof (disk_lights_t));
+}
diff --git a/usr/src/lib/fm/topo/libtopo/common/libtopo.h b/usr/src/lib/fm/topo/libtopo/common/libtopo.h
index 9b7102f882..3ea35cdddd 100644
--- a/usr/src/lib/fm/topo/libtopo/common/libtopo.h
+++ b/usr/src/lib/fm/topo/libtopo/common/libtopo.h
@@ -137,6 +137,8 @@ extern boolean_t topo_method_supported(tnode_t *, const char *,
topo_version_t);
extern int topo_node_facility(topo_hdl_t *, tnode_t *, const char *,
uint32_t, topo_faclist_t *, int *);
+extern int topo_node_child_walk(topo_hdl_t *, tnode_t *, topo_walk_cb_t,
+ void *, int *);
/*
* Node flags: denotes type of node
diff --git a/usr/src/lib/fm/topo/libtopo/common/mapfile-vers b/usr/src/lib/fm/topo/libtopo/common/mapfile-vers
index 1cde0dee1f..b81f4fd7c6 100644
--- a/usr/src/lib/fm/topo/libtopo/common/mapfile-vers
+++ b/usr/src/lib/fm/topo/libtopo/common/mapfile-vers
@@ -126,6 +126,7 @@ SYMBOL_VERSION SUNWprivate {
topo_node_asru;
topo_node_asru_set;
topo_node_bind;
+ topo_node_child_walk;
topo_node_facility;
topo_node_facbind;
topo_node_flags;
diff --git a/usr/src/lib/fm/topo/libtopo/common/topo_node.c b/usr/src/lib/fm/topo/libtopo/common/topo_node.c
index 553bc851f0..00b828c49c 100644
--- a/usr/src/lib/fm/topo/libtopo/common/topo_node.c
+++ b/usr/src/lib/fm/topo/libtopo/common/topo_node.c
@@ -837,3 +837,55 @@ topo_node_walk_init(topo_hdl_t *thp, topo_mod_t *mod, tnode_t *node,
return (wp);
}
+
+/*
+ * Walk the direct children of the given node, invoking cb_f(thp, child, arg)
+ * on each in turn.
+ *
+ * Returns TOPO_WALK_ERR with *errp set to ETOPO_WALK_EMPTY if the node has
+ * no children.  If a callback returns anything other than TOPO_WALK_NEXT,
+ * the walk stops and that value is returned.  Otherwise, once every child
+ * has been visited, TOPO_WALK_TERMINATE is returned.
+ */
+int
+topo_node_child_walk(topo_hdl_t *thp, tnode_t *pnode, topo_walk_cb_t cb_f,
+    void *arg, int *errp)
+{
+	int status = TOPO_WALK_TERMINATE;
+	tnode_t *child;
+
+	/* Keep the parent alive for the duration of the walk. */
+	topo_node_hold(pnode);
+
+	topo_node_lock(pnode);
+	child = topo_child_first(pnode);
+	topo_node_unlock(pnode);
+
+	if (child == NULL) {
+		*errp = ETOPO_WALK_EMPTY;
+		topo_node_rele(pnode);
+		return (TOPO_WALK_ERR);
+	}
+
+	do {
+		int cbret;
+
+		/*
+		 * Hold the child only across the callback itself.
+		 */
+		topo_node_hold(child);
+		cbret = cb_f(thp, child, arg);
+		topo_node_rele(child);
+
+		if (cbret != TOPO_WALK_NEXT) {
+			status = cbret;
+			break;
+		}
+
+		/*
+		 * Advance to the next sibling under the parent's lock.
+		 */
+		topo_node_lock(pnode);
+		child = topo_child_next(pnode, child);
+		topo_node_unlock(pnode);
+	} while (child != NULL);
+
+	topo_node_rele(pnode);
+	return (status);
+}