summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Stephen Hanson <Stephen.Hanson@Sun.COM> 2009-09-26 09:41:57 -0700
committer Stephen Hanson <Stephen.Hanson@Sun.COM> 2009-09-26 09:41:57 -0700
commit 5750ef5c2584f7399d9b98bfd513c0ca9f79f66e (patch)
tree 15a89d35e4c9fa75fe19a41f4bd8a0cc24413df2
parent e07b36b5f925802c7a364a9b5dcbd2a8d184e2c6 (diff)
download illumos-gate-5750ef5c2584f7399d9b98bfd513c0ca9f79f66e.tar.gz
6778240 generic historic diagnosis rules
-rw-r--r-- usr/src/cmd/fm/fmadm/common/faulty.c 158
-rw-r--r-- usr/src/cmd/fm/fmd/common/fmd.c 7
-rw-r--r-- usr/src/cmd/fm/fmd/common/fmd_asru.c 96
-rw-r--r-- usr/src/cmd/fm/fmd/common/fmd_asru.h 4
-rw-r--r-- usr/src/cmd/fm/fmd/common/fmd_case.c 623
-rw-r--r-- usr/src/cmd/fm/fmd/common/fmd_case.h 2
-rw-r--r-- usr/src/cmd/fm/fmd/common/fmd_protocol.c 3
-rw-r--r-- usr/src/cmd/fm/fmd/common/fmd_protocol.h 2
-rw-r--r-- usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c 17
-rw-r--r-- usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c 25
-rw-r--r-- usr/src/cmd/fm/modules/common/io-retire/rio_main.c 48
-rw-r--r-- usr/src/uts/common/sys/fm/protocol.h 1
12 files changed, 744 insertions, 242 deletions
diff --git a/usr/src/cmd/fm/fmadm/common/faulty.c b/usr/src/cmd/fm/fmadm/common/faulty.c
index be8c98484e..313f36a533 100644
--- a/usr/src/cmd/fm/fmadm/common/faulty.c
+++ b/usr/src/cmd/fm/fmadm/common/faulty.c
@@ -591,48 +591,6 @@ merge_name_list(name_list_t **list, name_list_t *new, int add_pct)
return (rt);
}
-/*
- * compare entries in two lists return true if the two lists have identical
- * content. The two lists may not have entries in the same order, so we compare
- * the size of the list as well as trying to find every entry from one list in
- * the other.
- */
-static int
-cmp_name_list(name_list_t *lxp1, name_list_t *lxp2)
-{
- name_list_t *lp1, *lp2;
- int l1 = 0, l2 = 0, common = 0;
-
- lp2 = lxp2;
- while (lp2) {
- l2++;
- lp2 = lp2->next;
- if (lp2 == lxp2)
- break;
- }
- lp1 = lxp1;
- while (lp1) {
- l1++;
- lp2 = lxp2;
- while (lp2) {
- if (strcmp(lp2->name, lp1->name) == 0) {
- common++;
- break;
- }
- lp2 = lp2->next;
- if (lp2 == lxp2)
- break;
- }
- lp1 = lp1->next;
- if (lp1 == lxp1)
- break;
- }
- if (l1 == l2 && l2 == common)
- return (0);
- else
- return (1);
-}
-
static name_list_t *
alloc_name_list(char *name, uint8_t pct)
{
@@ -650,24 +608,6 @@ alloc_name_list(char *name, uint8_t pct)
return (nlp);
}
-static void
-free_name_list(name_list_t *list)
-{
- name_list_t *next = list;
- name_list_t *lp;
-
- if (list) {
- do {
- lp = next;
- next = lp->next;
- if (lp->label)
- free(lp->label);
- free(lp->name);
- free(lp);
- } while (next != list);
- }
-}
-
static status_record_t *
new_record_init(uurec_t *uurec_p, char *msgid, name_list_t *class,
name_list_t *fru, name_list_t *asru, name_list_t *resource,
@@ -863,64 +803,6 @@ catalog_new_record(uurec_t *uurec_p, char *msgid, name_list_t *class,
add_list(status_rec_p, status_rec_p->asru, &status_asru_list);
}
-/*
- * add uuid and diagnoses time to an existing record for similar fault on the
- * same fru
- */
-static void
-catalog_merge_record(status_record_t *status_rec_p, uurec_t *uurec_p,
- name_list_t *asru, name_list_t *resource, name_list_t *serial,
- boolean_t not_suppressed)
-{
- uurec_t *uurec1_p;
-
- status_rec_p->nrecs++;
- /* add uurec in time order */
- if (status_rec_p->uurec->sec > uurec_p->sec) {
- uurec_p->next = status_rec_p->uurec;
- uurec_p->prev = NULL;
- status_rec_p->uurec = uurec_p;
- } else {
- uurec1_p = status_rec_p->uurec;
- while (uurec1_p->next && uurec1_p->next->sec <= uurec_p->sec)
- uurec1_p = uurec1_p->next;
- if (uurec1_p->next)
- uurec1_p->next->prev = uurec_p;
- uurec_p->next = uurec1_p->next;
- uurec_p->prev = uurec1_p;
- uurec1_p->next = uurec_p;
- }
- status_rec_p->not_suppressed |= not_suppressed;
- uurec_p->asru = merge_name_list(&status_rec_p->asru, asru, 0);
- (void) merge_name_list(&status_rec_p->resource, resource, 0);
- (void) merge_name_list(&status_rec_p->serial, serial, 0);
-}
-
-static status_record_t *
-record_in_catalog(name_list_t *class, name_list_t *fru,
- char *msgid, hostid_t *host)
-{
- sr_list_t *status_rec_p;
- status_record_t *srp = NULL;
-
- status_rec_p = status_rec_list;
- while (status_rec_p) {
- srp = status_rec_p->status_record;
- if (host == srp->host &&
- cmp_name_list(class, srp->class) == 0 &&
- cmp_name_list(fru, srp->fru) == 0 &&
- strcmp(msgid, srp->msgid) == 0)
- break;
- if (status_rec_p->next == status_rec_list) {
- srp = NULL;
- break;
- } else {
- status_rec_p = status_rec_p->next;
- }
- }
- return (srp);
-}
-
static void
get_serial_no(nvlist_t *nvl, name_list_t **serial_p, uint8_t pct)
{
@@ -993,6 +875,15 @@ extract_record_info(nvlist_t *nvl, name_list_t **class_p,
(void) merge_name_list(fru_p, nlp, 1);
}
get_serial_no(lfru, serial_p, lpct);
+ } else if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) != 0) {
+ /*
+ * No FRU or resource. But we want to display the repair status
+ * somehow, so create a dummy FRU field.
+ */
+ nlp = alloc_name_list(dgettext("FMD", "None"), lpct);
+ nlp->status = status & ~(FM_SUSPECT_UNUSABLE |
+ FM_SUSPECT_DEGRADED);
+ (void) merge_name_list(fru_p, nlp, 1);
}
if (nvlist_lookup_nvlist(nvl, FM_FAULT_ASRU, &lasru) == 0) {
name = get_nvl2str_topo(lasru);
@@ -1029,7 +920,6 @@ add_fault_record_to_catalog(nvlist_t *nvl, uint64_t sec, char *uuid)
name_list_t *asru = NULL, *fru = NULL, *serial = NULL;
nvlist_t **nva;
uint8_t *ba;
- status_record_t *status_rec_p;
uurec_t *uurec_p;
hostid_t *host;
boolean_t not_suppressed = 1;
@@ -1066,19 +956,8 @@ add_fault_record_to_catalog(nvlist_t *nvl, uint64_t sec, char *uuid)
uurec_p->event = NULL;
(void) nvlist_dup(nvl, &uurec_p->event, 0);
host = find_hostid(nvl);
- if (not_suppressed && !opt_g)
- status_rec_p = NULL;
- else
- status_rec_p = record_in_catalog(class, fru, msgid, host);
- if (status_rec_p) {
- catalog_merge_record(status_rec_p, uurec_p, asru, resource,
- serial, not_suppressed);
- free_name_list(class);
- free_name_list(fru);
- } else {
- catalog_new_record(uurec_p, msgid, class, fru, asru,
- resource, serial, not_suppressed, host);
- }
+ catalog_new_record(uurec_p, msgid, class, fru, asru,
+ resource, serial, not_suppressed, host);
}
static void
@@ -1473,10 +1352,17 @@ print_sup_record(status_record_t *srp, int opt_i, int full)
}
if (full || srp->fru == NULL || srp->asru == NULL) {
if (srp->resource) {
- print_name_list(srp->resource,
- dgettext("FMD", "Problem in :"),
- NULL, full ? 0 : max_display, 0, print_rsrc_status,
- full);
+ status = asru_same_status(srp->resource);
+ if (status != -1) {
+ print_name_list(srp->resource,
+ dgettext("FMD", "Problem in :"), NULL,
+ full ? 0 : max_display, 0, NULL, full);
+ print_rsrc_status(status, " ");
+ } else
+ print_name_list(srp->resource,
+ dgettext("FMD", "Problem in :"),
+ NULL, full ? 0 : max_display, 0,
+ print_rsrc_status, full);
}
}
if (srp->fru) {
diff --git a/usr/src/cmd/fm/fmd/common/fmd.c b/usr/src/cmd/fm/fmd/common/fmd.c
index e904c2c74d..fe61e14a79 100644
--- a/usr/src/cmd/fm/fmd/common/fmd.c
+++ b/usr/src/cmd/fm/fmd/common/fmd.c
@@ -288,6 +288,7 @@ static const fmd_conf_formal_t _fmd_conf[] = {
{ "rpc.api.prog", &fmd_conf_uint32, "100170" }, /* FMD_API rpc program num */
{ "rpc.rcvsize", &fmd_conf_size, "128k" }, /* rpc receive buffer size */
{ "rpc.sndsize", &fmd_conf_size, "128k" }, /* rpc send buffer size */
+{ "rsrc.pollperiod", &fmd_conf_time, "1h" }, /* aged rsrcs poller period */
{ "rsrc.age", &fmd_conf_time, "30d" }, /* max age of old rsrc log */
{ "rsrc.zero", &fmd_conf_bool, "false" }, /* zero rsrc cache on start? */
{ "schemedir", &fmd_conf_string, _fmd_scheme_path }, /* path for scheme mods */
@@ -705,12 +706,12 @@ fmd_gc(fmd_t *dp, id_t id, hrtime_t hrt)
static void
fmd_clear_aged_rsrcs(fmd_t *dp, id_t id, hrtime_t hrt)
{
- hrtime_t delta;
+ hrtime_t period;
fmd_asru_clear_aged_rsrcs();
- (void) fmd_conf_getprop(dp->d_conf, "rsrc.age", &delta);
+ (void) fmd_conf_getprop(dp->d_conf, "rsrc.pollperiod", &period);
(void) fmd_timerq_install(dp->d_timers, dp->d_rmod->mod_timerids,
- (fmd_timer_f *)fmd_clear_aged_rsrcs, dp, NULL, delta/10);
+ (fmd_timer_f *)fmd_clear_aged_rsrcs, dp, NULL, period);
}
/*
diff --git a/usr/src/cmd/fm/fmd/common/fmd_asru.c b/usr/src/cmd/fm/fmd/common/fmd_asru.c
index 07a98d51fe..31e98168e0 100644
--- a/usr/src/cmd/fm/fmd/common/fmd_asru.c
+++ b/usr/src/cmd/fm/fmd/common/fmd_asru.c
@@ -441,7 +441,7 @@ fmd_asru_hash_recreate(fmd_log_t *lp, fmd_event_t *ep, fmd_asru_hash_t *ahp)
boolean_t faulty = FMD_B_FALSE, unusable = FMD_B_FALSE;
int ps;
boolean_t repaired = FMD_B_FALSE, replaced = FMD_B_FALSE;
- boolean_t acquitted = FMD_B_FALSE;
+ boolean_t acquitted = FMD_B_FALSE, resolved = FMD_B_FALSE;
nvlist_t *flt, *flt_copy, *asru;
char *case_uuid = NULL, *case_code = NULL;
fmd_asru_t *ap;
@@ -481,17 +481,20 @@ fmd_asru_hash_recreate(fmd_log_t *lp, fmd_event_t *ep, fmd_asru_hash_t *ahp)
&replaced);
(void) nvlist_lookup_boolean_value(nvl, FM_RSRC_ASRU_ACQUITTED,
&acquitted);
+ (void) nvlist_lookup_boolean_value(nvl, FM_RSRC_ASRU_RESOLVED,
+ &resolved);
/*
- * Attempt to recreate the case in either the CLOSED or REPAIRED state
- * (depending on whether the faulty bit is still set).
+ * Attempt to recreate the case in CLOSED, REPAIRED or RESOLVED state
+ * (depending on whether the faulty/resolved bits are set).
* If the case is already present, fmd_case_recreate() will return it.
* If not, we'll create a new orphaned case. Either way, we use the
* ASRU event to insert a suspect into the partially-restored case.
*/
fmd_module_lock(fmd.d_rmod);
cp = fmd_case_recreate(fmd.d_rmod, NULL, faulty ? FMD_CASE_CLOSED :
- FMD_CASE_REPAIRED, case_uuid, case_code);
+ resolved ? FMD_CASE_RESOLVED : FMD_CASE_REPAIRED, case_uuid,
+ case_code);
fmd_case_hold(cp);
fmd_module_unlock(fmd.d_rmod);
if (nvlist_lookup_int64_array(nvl, FM_SUSPECT_DIAG_TIME, &diag_time,
@@ -581,6 +584,8 @@ fmd_asru_hash_recreate(fmd_log_t *lp, fmd_event_t *ep, fmd_asru_hash_t *ahp)
alp->al_reason = FMD_ASRU_REPAIRED;
else if (acquitted)
alp->al_reason = FMD_ASRU_ACQUITTED;
+ else
+ alp->al_reason = FMD_ASRU_REMOVED;
TRACE((FMD_DBG_ASRU, "asru %s recreated as %p (%s)", alp->al_uuid,
(void *)ap, _fmd_asru_snames[ap->asru_flags & FMD_ASRU_STATE]));
@@ -712,6 +717,9 @@ fmd_asru_repair_if_aged(fmd_asru_link_t *alp, void *arg)
int err;
fmd_asru_rep_arg_t fara;
+ if (!(alp->al_flags & FMD_ASRU_FAULTY))
+ return;
+
/*
* Checking for aged resources only happens on the diagnosing side
* not on a proxy.
@@ -740,10 +748,55 @@ fmd_asru_repair_if_aged(fmd_asru_link_t *alp, void *arg)
}
}
+/*ARGSUSED*/
+void
+fmd_asru_check_if_aged(fmd_asru_link_t *alp, void *arg)
+{
+ struct timeval tv;
+ fmd_log_t *lp;
+ hrtime_t hrt;
+
+ /*
+ * Case must be in resolved state for this to be called. So modified
+ * time on resource cache entry should be the time the resolve occurred.
+ * Set *arg to 0 if the entry has not yet reached rsrc.age.
+ */
+ fmd_time_gettimeofday(&tv);
+ lp = fmd_log_open(alp->al_asru->asru_root, alp->al_uuid, FMD_LOG_ASRU);
+ if (lp == NULL)
+ return;
+ hrt = (hrtime_t)(tv.tv_sec - lp->log_stat.st_mtime);
+ fmd_log_rele(lp);
+ if (hrt * NANOSEC < fmd.d_asrus->ah_lifetime)
+ *(int *)arg = 0;
+}
+
+/*ARGSUSED*/
+void
+fmd_asru_most_recent(fmd_asru_link_t *alp, void *arg)
+{
+ fmd_log_t *lp;
+ uint64_t hrt;
+
+ /*
+ * Find most recent modified time of a set of resource cache entries.
+ */
+ lp = fmd_log_open(alp->al_asru->asru_root, alp->al_uuid, FMD_LOG_ASRU);
+ if (lp == NULL)
+ return;
+ hrt = lp->log_stat.st_mtime;
+ fmd_log_rele(lp);
+ if (*(uint64_t *)arg < hrt)
+ *(uint64_t *)arg = hrt;
+}
+
void
fmd_asru_clear_aged_rsrcs()
{
+ int check_if_aged = 1;
fmd_asru_al_hash_apply(fmd.d_asrus, fmd_asru_repair_if_aged, NULL);
+ fmd_case_hash_apply(fmd.d_cases, fmd_case_discard_resolved,
+ &check_if_aged);
}
fmd_asru_hash_t *
@@ -1298,6 +1351,22 @@ fmd_asru_repaired(fmd_asru_link_t *alp, void *arg)
}
/*
+ * Discard the case associated with this alp if it is in resolved state.
+ * Called on "fmadm flush".
+ */
+/*ARGSUSED*/
+void
+fmd_asru_flush(fmd_asru_link_t *alp, void *arg)
+{
+ int check_if_aged = 0;
+ int *rval = (int *)arg;
+
+ if (alp->al_case)
+ fmd_case_discard_resolved(alp->al_case, &check_if_aged);
+ *rval = 0;
+}
+
+/*
* This is only called for proxied faults. Set various flags so we can
* find the nature of the transport from the resource cache code.
*/
@@ -1459,7 +1528,8 @@ fmd_asru_logevent(fmd_asru_link_t *alp)
nvl = fmd_protocol_rsrc_asru(_fmd_asru_events[faulty | (unusable << 1)],
alp->al_asru_fmri, cip->ci_uuid, cip->ci_code, faulty, unusable,
message, alp->al_event, &cip->ci_tv, repaired, replaced, acquitted,
- cip->ci_diag_de == NULL ? cip->ci_mod->mod_fmri : cip->ci_diag_de);
+ cip->ci_state == FMD_CASE_RESOLVED, cip->ci_diag_de == NULL ?
+ cip->ci_mod->mod_fmri : cip->ci_diag_de);
(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
@@ -1525,7 +1595,9 @@ fmd_asru_clrflags(fmd_asru_link_t *alp, uint_t sflag, uint8_t reason)
nstate = alp->al_flags & FMD_ASRU_STATE;
if (nstate == ostate) {
- if (reason > alp->al_reason) {
+ if (reason > alp->al_reason &&
+ ((fmd_case_impl_t *)alp->al_case)->ci_state <
+ FMD_CASE_REPAIRED) {
alp->al_reason = reason;
fmd_asru_logevent(alp);
(void) pthread_cond_broadcast(&ap->asru_cv);
@@ -1560,6 +1632,18 @@ fmd_asru_clrflags(fmd_asru_link_t *alp, uint_t sflag, uint8_t reason)
return (1);
}
+/*ARGSUSED*/
+void
+fmd_asru_log_resolved(fmd_asru_link_t *alp, void *unused)
+{
+ fmd_asru_t *ap = alp->al_asru;
+
+ (void) pthread_mutex_lock(&ap->asru_lock);
+ fmd_asru_logevent(alp);
+ (void) pthread_cond_broadcast(&ap->asru_cv);
+ (void) pthread_mutex_unlock(&ap->asru_lock);
+}
+
/*
* Report the current known state of the link entry (ie this particular fault
* affecting this particular ASRU).
diff --git a/usr/src/cmd/fm/fmd/common/fmd_asru.h b/usr/src/cmd/fm/fmd/common/fmd_asru.h
index 4d3bc8042a..c20f376df2 100644
--- a/usr/src/cmd/fm/fmd/common/fmd_asru.h
+++ b/usr/src/cmd/fm/fmd/common/fmd_asru.h
@@ -195,6 +195,7 @@ typedef struct {
char *fara_uuid; /* uuid can be passed in for comparison */
} fmd_asru_rep_arg_t;
extern void fmd_asru_repaired(fmd_asru_link_t *, void *);
+extern void fmd_asru_flush(fmd_asru_link_t *, void *);
typedef struct {
int *faus_countp;
@@ -225,8 +226,11 @@ extern void fmd_asru_close_status(fmd_asru_link_t *alp, void *arg);
extern int fmd_asru_setflags(fmd_asru_link_t *, uint_t);
extern int fmd_asru_clrflags(fmd_asru_link_t *, uint_t, uint8_t);
+extern void fmd_asru_log_resolved(fmd_asru_link_t *, void *);
extern int fmd_asru_al_getstate(fmd_asru_link_t *);
extern int fmd_asru_getstate(fmd_asru_t *);
+extern void fmd_asru_check_if_aged(fmd_asru_link_t *, void *);
+void fmd_asru_most_recent(fmd_asru_link_t *, void *);
#ifdef __cplusplus
}
diff --git a/usr/src/cmd/fm/fmd/common/fmd_case.c b/usr/src/cmd/fm/fmd/common/fmd_case.c
index 8a03b670eb..5e40c593b2 100644
--- a/usr/src/cmd/fm/fmd/common/fmd_case.c
+++ b/usr/src/cmd/fm/fmd/common/fmd_case.c
@@ -220,6 +220,39 @@ fmd_case_hash_apply(fmd_case_hash_t *chp,
}
static void
+fmd_case_hash_apply_except_current(fmd_case_hash_t *chp,
+ void (*func)(fmd_case_t *, void *), void *arg, fmd_case_t *current)
+{
+ fmd_case_impl_t *cp, **cps, **cpp;
+ uint_t cpc, i;
+
+ (void) pthread_rwlock_rdlock(&chp->ch_lock);
+
+ cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
+ cpc = chp->ch_count;
+
+ for (i = 0; i < chp->ch_hashlen; i++) {
+ for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next)
+ if (cp != (fmd_case_impl_t *)current)
+ *cpp++ = fmd_case_tryhold(cp);
+ else
+ *cpp++ = cp;
+ }
+
+ ASSERT(cpp == cps + cpc);
+ (void) pthread_rwlock_unlock(&chp->ch_lock);
+
+ for (i = 0; i < cpc; i++) {
+ if (cps[i] != NULL && cps[i] != (fmd_case_impl_t *)current) {
+ func((fmd_case_t *)cps[i], arg);
+ fmd_case_rele((fmd_case_t *)cps[i]);
+ }
+ }
+
+ fmd_free(cps, cpc * sizeof (fmd_case_t *));
+}
+
+static void
fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
{
uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
@@ -453,6 +486,12 @@ fmd_case_mkevent(fmd_case_t *cp, const char *class)
return (nvl);
}
+static int fmd_case_match_on_faulty_overlap = 1;
+static int fmd_case_match_on_acquit_overlap = 1;
+static int fmd_case_auto_acquit_isolated = 1;
+static int fmd_case_auto_acquit_non_acquitted = 1;
+static int fmd_case_too_recent = 10; /* time in seconds */
+
static boolean_t
fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem)
{
@@ -498,82 +537,377 @@ done:
}
static int
-fmd_case_match_suspect(fmd_case_susp_t *cis, fmd_case_susp_t *xcis)
+fmd_case_match_suspect(nvlist_t *nvl1, nvlist_t *nvl2)
{
char *class, *new_class;
- if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_ASRU))
+ if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_ASRU))
return (0);
- if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl,
- FM_FAULT_RESOURCE))
+ if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_RESOURCE))
return (0);
- if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_FRU))
+ if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_FRU))
return (0);
- (void) nvlist_lookup_string(xcis->cis_nvl, FM_CLASS, &class);
- (void) nvlist_lookup_string(cis->cis_nvl, FM_CLASS, &new_class);
+ (void) nvlist_lookup_string(nvl2, FM_CLASS, &class);
+ (void) nvlist_lookup_string(nvl1, FM_CLASS, &new_class);
return (strcmp(class, new_class) == 0);
}
+typedef struct {
+ int *fcms_countp;
+ int fcms_maxcount;
+ fmd_case_impl_t *fcms_cip;
+ uint8_t *fcms_new_susp_state;
+ uint8_t *fcms_old_susp_state;
+ uint8_t *fcms_old_match_state;
+} fcms_t;
+#define SUSPECT_STATE_FAULTY 0x1
+#define SUSPECT_STATE_ISOLATED 0x2
+#define SUSPECT_STATE_REMOVED 0x4
+#define SUSPECT_STATE_ACQUITED 0x8
+#define SUSPECT_STATE_REPAIRED 0x10
+#define SUSPECT_STATE_REPLACED 0x20
+#define SUSPECT_STATE_NO_MATCH 0x1
+
/*
- * see if an identical suspect list already exists in the cache
+ * This is called for each suspect in the old case. Compare it against each
+ * suspect in the new case, setting fcms_old_susp_state and fcms_new_susp_state
+ * as appropriate. fcms_new_susp_state will be left as 0 if the suspect is
+ * not found in the old case.
*/
-static int
-fmd_case_check_for_dups(fmd_case_t *cp)
+static void
+fmd_case_match_suspects(fmd_asru_link_t *alp, void *arg)
{
- fmd_case_impl_t *cip = (fmd_case_impl_t *)cp, *xcip;
- fmd_case_hash_t *chp = fmd.d_cases;
- fmd_case_susp_t *xcis, *cis;
- int match = 0, match_susp;
- uint_t h;
+ fcms_t *fcmsp = (fcms_t *)arg;
+ fmd_case_impl_t *cip = fcmsp->fcms_cip;
+ fmd_case_susp_t *cis;
+ int i = 0;
+ int state = fmd_asru_al_getstate(alp);
- (void) pthread_rwlock_rdlock(&chp->ch_lock);
+ if (*fcmsp->fcms_countp >= fcmsp->fcms_maxcount)
+ return;
+
+ if (!(state & FMD_ASRU_PRESENT) || (!(state & FMD_ASRU_FAULTY) &&
+ alp->al_reason == FMD_ASRU_REMOVED))
+ fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
+ SUSPECT_STATE_REMOVED;
+ else if ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_FAULTY))
+ fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
+ SUSPECT_STATE_ISOLATED;
+ else if (state & FMD_ASRU_FAULTY)
+ fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
+ SUSPECT_STATE_FAULTY;
+ else if (alp->al_reason == FMD_ASRU_REPLACED)
+ fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
+ SUSPECT_STATE_REPLACED;
+ else if (alp->al_reason == FMD_ASRU_ACQUITTED)
+ fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
+ SUSPECT_STATE_ACQUITED;
+ else
+ fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
+ SUSPECT_STATE_REPAIRED;
+
+ for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next, i++)
+ if (fmd_case_match_suspect(cis->cis_nvl, alp->al_event) == 1)
+ break;
+ if (cis != NULL)
+ fcmsp->fcms_new_susp_state[i] =
+ fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp];
+ else
+ fcmsp->fcms_old_match_state[*fcmsp->fcms_countp] |=
+ SUSPECT_STATE_NO_MATCH;
+ (*fcmsp->fcms_countp)++;
+}
+
+typedef struct {
+ int *fca_do_update;
+ fmd_case_impl_t *fca_cip;
+} fca_t;
+
+/*
+ * Re-fault all acquitted suspects that are still present in the new list.
+ */
+static void
+fmd_case_fault_acquitted_matching(fmd_asru_link_t *alp, void *arg)
+{
+ fca_t *fcap = (fca_t *)arg;
+ fmd_case_impl_t *cip = fcap->fca_cip;
+ fmd_case_susp_t *cis;
+ int state = fmd_asru_al_getstate(alp);
+
+ if (!(state & FMD_ASRU_FAULTY) &&
+ alp->al_reason == FMD_ASRU_ACQUITTED) {
+ for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
+ if (fmd_case_match_suspect(cis->cis_nvl,
+ alp->al_event) == 1)
+ break;
+ if (cis != NULL) {
+ (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
+ *fcap->fca_do_update = 1;
+ }
+ }
+}
+
+/*
+ * Re-fault all suspects that are still present in the new list.
+ */
+static void
+fmd_case_fault_all_matching(fmd_asru_link_t *alp, void *arg)
+{
+ fca_t *fcap = (fca_t *)arg;
+ fmd_case_impl_t *cip = fcap->fca_cip;
+ fmd_case_susp_t *cis;
+ int state = fmd_asru_al_getstate(alp);
+
+ if (!(state & FMD_ASRU_FAULTY)) {
+ for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
+ if (fmd_case_match_suspect(cis->cis_nvl,
+ alp->al_event) == 1)
+ break;
+ if (cis != NULL) {
+ (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
+ *fcap->fca_do_update = 1;
+ }
+ }
+}
+
+/*
+ * Acquit all suspects that are no longer present in the new list.
+ */
+static void
+fmd_case_acquit_no_match(fmd_asru_link_t *alp, void *arg)
+{
+ fca_t *fcap = (fca_t *)arg;
+ fmd_case_impl_t *cip = fcap->fca_cip;
+ fmd_case_susp_t *cis;
+ int state = fmd_asru_al_getstate(alp);
+
+ if (state & FMD_ASRU_FAULTY) {
+ for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
+ if (fmd_case_match_suspect(cis->cis_nvl,
+ alp->al_event) == 1)
+ break;
+ if (cis == NULL) {
+ (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
+ FMD_ASRU_ACQUITTED);
+ *fcap->fca_do_update = 1;
+ }
+ }
+}
+
+/*
+ * Acquit all isolated suspects.
+ */
+static void
+fmd_case_acquit_isolated(fmd_asru_link_t *alp, void *arg)
+{
+ int *do_update = (int *)arg;
+ int state = fmd_asru_al_getstate(alp);
+
+ if ((state & FMD_ASRU_PRESENT) && (state & FMD_ASRU_UNUSABLE) &&
+ (state & FMD_ASRU_FAULTY)) {
+ (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
+ FMD_ASRU_ACQUITTED);
+ *do_update = 1;
+ }
+}
+
+/*
+ * Acquit suspect which matches specified nvlist
+ */
+static void
+fmd_case_acquit_suspect(fmd_asru_link_t *alp, void *arg)
+{
+ nvlist_t *nvl = (nvlist_t *)arg;
+ int state = fmd_asru_al_getstate(alp);
+
+ if ((state & FMD_ASRU_FAULTY) &&
+ fmd_case_match_suspect(nvl, alp->al_event) == 1)
+ (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
+ FMD_ASRU_ACQUITTED);
+}
+
+typedef struct {
+ fmd_case_impl_t *fccd_cip;
+ uint8_t *fccd_new_susp_state;
+ uint8_t *fccd_new_match_state;
+ int *fccd_discard_new;
+ int *fccd_adjust_new;
+} fccd_t;
+
+/*
+ * see if a matching suspect list already exists in the cache
+ */
+static void
+fmd_case_check_for_dups(fmd_case_t *old_cp, void *arg)
+{
+ fccd_t *fccdp = (fccd_t *)arg;
+ fmd_case_impl_t *new_cip = fccdp->fccd_cip;
+ fmd_case_impl_t *old_cip = (fmd_case_impl_t *)old_cp;
+ int i, count = 0, do_update = 0, got_isolated_overlap = 0;
+ int got_faulty_overlap = 0;
+ int got_acquit_overlap = 0;
+ boolean_t too_recent;
+ uint64_t most_recent = 0;
+ fcms_t fcms;
+ fca_t fca;
+ uint8_t *new_susp_state;
+ uint8_t *old_susp_state;
+ uint8_t *old_match_state;
+
+ new_susp_state = alloca(new_cip->ci_nsuspects * sizeof (uint8_t));
+ for (i = 0; i < new_cip->ci_nsuspects; i++)
+ new_susp_state[i] = 0;
+ old_susp_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
+ for (i = 0; i < old_cip->ci_nsuspects; i++)
+ old_susp_state[i] = 0;
+ old_match_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
+ for (i = 0; i < old_cip->ci_nsuspects; i++)
+ old_match_state[i] = 0;
/*
- * Find all cases with this code
+ * Compare with each suspect in the existing case.
*/
- h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
- for (xcip = chp->ch_code_hash[h]; xcip != NULL;
- xcip = xcip->ci_code_next) {
- /*
- * only look for any cases (apart from this one)
- * whose code and number of suspects match
- */
- if (xcip == cip || fmd_case_tryhold(xcip) == NULL)
- continue;
- if (strcmp(xcip->ci_code, cip->ci_code) != 0 ||
- xcip->ci_nsuspects != cip->ci_nsuspects) {
- fmd_case_rele((fmd_case_t *)xcip);
- continue;
+ fcms.fcms_countp = &count;
+ fcms.fcms_maxcount = old_cip->ci_nsuspects;
+ fcms.fcms_cip = new_cip;
+ fcms.fcms_new_susp_state = new_susp_state;
+ fcms.fcms_old_susp_state = old_susp_state;
+ fcms.fcms_old_match_state = old_match_state;
+ fmd_asru_hash_apply_by_case(fmd.d_asrus, (fmd_case_t *)old_cip,
+ fmd_case_match_suspects, &fcms);
+
+ /*
+ * If we have some faulty, non-isolated suspects that overlap, then most
+ * likely it is the suspects that overlap in the suspect lists that are
+ * to blame. So we can consider this to be a match.
+ */
+ for (i = 0; i < new_cip->ci_nsuspects; i++)
+ if (new_susp_state[i] == SUSPECT_STATE_FAULTY)
+ got_faulty_overlap = 1;
+ if (got_faulty_overlap && fmd_case_match_on_faulty_overlap)
+ goto got_match;
+
+ /*
+ * If we have no faulty, non-isolated suspects in the old case, but we
+ * do have some acquitted suspects that overlap, then most likely it is
+ * the acquitted suspects that overlap in the suspect lists that are
+ * to blame. So we can consider this to be a match.
+ */
+ for (i = 0; i < new_cip->ci_nsuspects; i++)
+ if (new_susp_state[i] == SUSPECT_STATE_ACQUITED)
+ got_acquit_overlap = 1;
+ for (i = 0; i < old_cip->ci_nsuspects; i++)
+ if (old_susp_state[i] == SUSPECT_STATE_FAULTY)
+ got_acquit_overlap = 0;
+ if (got_acquit_overlap && fmd_case_match_on_acquit_overlap)
+ goto got_match;
+
+ /*
+ * Check that all suspects in the new list are present in the old list.
+ * Return if we find one that isn't.
+ */
+ for (i = 0; i < new_cip->ci_nsuspects; i++)
+ if (new_susp_state[i] == 0)
+ return;
+
+ /*
+ * Check that all suspects in the old list are present in the new list
+ * *or* they are isolated or removed/replaced (which would explain why
+ * they are not present in the new list). Return if we find one that is
+ * faulty and unisolated or repaired or acquitted, and that is not
+ * present in the new case.
+ */
+ for (i = 0; i < old_cip->ci_nsuspects; i++)
+ if (old_match_state[i] == SUSPECT_STATE_NO_MATCH &&
+ (old_susp_state[i] == SUSPECT_STATE_FAULTY ||
+ old_susp_state[i] == SUSPECT_STATE_ACQUITED ||
+ old_susp_state[i] == SUSPECT_STATE_REPAIRED))
+ return;
+
+got_match:
+ /*
+ * If the old case is already in repaired/resolved state, we can't
+ * do anything more with it, so keep the new case, but acquit some
+ * of the suspects if appropriate.
+ */
+ if (old_cip->ci_state >= FMD_CASE_REPAIRED) {
+ if (fmd_case_auto_acquit_non_acquitted) {
+ *fccdp->fccd_adjust_new = 1;
+ for (i = 0; i < new_cip->ci_nsuspects; i++) {
+ fccdp->fccd_new_susp_state[i] |=
+ new_susp_state[i];
+ if (new_susp_state[i] == 0)
+ fccdp->fccd_new_susp_state[i] =
+ SUSPECT_STATE_NO_MATCH;
+ }
}
+ return;
+ }
+ /*
+ * Otherwise discard the new case and keep the old, again updating the
+ * state of the suspects as appropriate
+ */
+ *fccdp->fccd_discard_new = 1;
+ fca.fca_cip = new_cip;
+ fca.fca_do_update = &do_update;
+
+ /*
+ * See if new case occurred within fmd_case_too_recent seconds of the
+ * most recent modification to the old case and if so don't do
+ * auto-acquit. This avoids problems if a flood of ereports come in and
+ * they don't all get diagnosed before the first case causes some of
+ * the devices to be isolated making it appear that an isolated device
+ * was in the suspect list.
+ */
+ fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
+ fmd_asru_most_recent, &most_recent);
+ too_recent = (new_cip->ci_tv.tv_sec - most_recent <
+ fmd_case_too_recent);
+
+ if (got_faulty_overlap) {
/*
- * For each suspect in one list, check if there
- * is an identical suspect in the other list
+ * Acquit any suspects not present in the new list, plus
+ * any that are present but are isolated.
*/
- match = 1;
- for (xcis = xcip->ci_suspects; xcis != NULL;
- xcis = xcis->cis_next) {
- match_susp = 0;
- for (cis = cip->ci_suspects; cis != NULL;
- cis = cis->cis_next) {
- if (fmd_case_match_suspect(cis, xcis) == 1) {
- match_susp = 1;
- break;
- }
- }
- if (match_susp == 0) {
- match = 0;
- break;
- }
- }
- fmd_case_rele((fmd_case_t *)xcip);
- if (match) {
- (void) pthread_rwlock_unlock(&chp->ch_lock);
- return (1);
+ fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
+ fmd_case_acquit_no_match, &fca);
+ if (fmd_case_auto_acquit_isolated && !too_recent)
+ fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
+ fmd_case_acquit_isolated, &do_update);
+ } else if (got_acquit_overlap) {
+ /*
+ * Re-fault the acquitted matching suspects and acquit all
+ * isolated suspects.
+ */
+ if (fmd_case_auto_acquit_isolated && !too_recent) {
+ fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
+ fmd_case_fault_acquitted_matching, &fca);
+ fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
+ fmd_case_acquit_isolated, &do_update);
}
+ } else if (fmd_case_auto_acquit_isolated) {
+ /*
+ * To get here, there must be no faulty or acquitted suspects,
+ * but there must be at least one isolated suspect. Just acquit
+ * non-matching isolated suspects. If there are no matching
+ * isolated suspects, then re-fault all matching suspects.
+ */
+ for (i = 0; i < new_cip->ci_nsuspects; i++)
+ if (new_susp_state[i] == SUSPECT_STATE_ISOLATED)
+ got_isolated_overlap = 1;
+ if (!got_isolated_overlap)
+ fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
+ fmd_case_fault_all_matching, &fca);
+ fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
+ fmd_case_acquit_no_match, &fca);
}
- (void) pthread_rwlock_unlock(&chp->ch_lock);
- return (0);
+
+ /*
+ * If we've updated anything in the old case, call fmd_case_update()
+ */
+ if (do_update)
+ fmd_case_update(old_cp);
}
/*
@@ -610,22 +944,49 @@ fmd_case_convict(fmd_case_t *cp)
{
fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
fmd_asru_hash_t *ahp = fmd.d_asrus;
-
+ int discard_new = 0, i;
fmd_case_susp_t *cis;
fmd_asru_link_t *alp;
+ uint8_t *new_susp_state;
+ uint8_t *new_match_state;
+ int adjust_new = 0;
+ fccd_t fccd;
(void) pthread_mutex_lock(&cip->ci_lock);
if (cip->ci_code == NULL)
(void) fmd_case_mkcode(cp);
else if (cip->ci_precanned)
fmd_case_code_hash_insert(fmd.d_cases, cip);
- if (fmd_case_check_for_dups(cp) == 1) {
+
+ /*
+ * First we must see if any matching cases already exist.
+ */
+ new_susp_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
+ for (i = 0; i < cip->ci_nsuspects; i++)
+ new_susp_state[i] = 0;
+ new_match_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
+ for (i = 0; i < cip->ci_nsuspects; i++)
+ new_match_state[i] = 0;
+ fccd.fccd_cip = cip;
+ fccd.fccd_adjust_new = &adjust_new;
+ fccd.fccd_new_susp_state = new_susp_state;
+ fccd.fccd_new_match_state = new_match_state;
+ fccd.fccd_discard_new = &discard_new;
+ fmd_case_hash_apply_except_current(fmd.d_cases, fmd_case_check_for_dups,
+ &fccd, cp);
+
+ if (discard_new) {
+ /*
+ * We've found an existing case that is a match and it is not
+ * already in repaired or resolved state. So we can close this
+ * one as a duplicate.
+ */
(void) pthread_mutex_unlock(&cip->ci_lock);
return (1);
}
/*
- * no suspect list already exists - allocate new cache entries
+ * Allocate new cache entries
*/
for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
if ((alp = fmd_asru_hash_create_entry(ahp,
@@ -640,6 +1001,45 @@ fmd_case_convict(fmd_case_t *cp)
(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
}
+ if (adjust_new) {
+ int some_suspect = 0, some_not_suspect = 0;
+
+ /*
+ * There are one or more matching cases but they are already in
+ * repaired or resolved state. So we need to keep the new
+ * case, but we can adjust it. Repaired/removed/replaced
+ * suspects are unlikely to be to blame (unless there are
+ * actually two separate faults). So if we have a combination of
+ * repaired/replaced/removed suspects and acquitted suspects in
+ * the old lists, then we should acquit in the new list those
+ * that were repaired/replaced/removed in the old.
+ */
+ for (i = 0; i < cip->ci_nsuspects; i++) {
+ if ((new_susp_state[i] & SUSPECT_STATE_REPLACED) ||
+ (new_susp_state[i] & SUSPECT_STATE_REPAIRED) ||
+ (new_susp_state[i] & SUSPECT_STATE_REMOVED) ||
+ (new_match_state[i] & SUSPECT_STATE_NO_MATCH))
+ some_not_suspect = 1;
+ else
+ some_suspect = 1;
+ }
+ if (some_suspect && some_not_suspect) {
+ for (cis = cip->ci_suspects, i = 0; cis != NULL;
+ cis = cis->cis_next, i++)
+ if ((new_susp_state[i] &
+ SUSPECT_STATE_REPLACED) ||
+ (new_susp_state[i] &
+ SUSPECT_STATE_REPAIRED) ||
+ (new_susp_state[i] &
+ SUSPECT_STATE_REMOVED) ||
+ (new_match_state[i] &
+ SUSPECT_STATE_NO_MATCH))
+ fmd_asru_hash_apply_by_case(fmd.d_asrus,
+ cp, fmd_case_acquit_suspect,
+ cis->cis_nvl);
+ }
+ }
+
(void) pthread_mutex_unlock(&cip->ci_lock);
return (0);
}
@@ -934,8 +1334,6 @@ fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
fmd_case_impl_t *eip;
- ASSERT(state < FMD_CASE_RESOLVED);
-
(void) pthread_mutex_init(&cip->ci_lock, NULL);
fmd_buf_hash_create(&cip->ci_bufs);
@@ -987,11 +1385,12 @@ fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
/*
* When recreating an orphan case, state passed in may
- * either be CLOSED (faulty) or REPAIRED (!faulty). If
+ * be CLOSED (faulty) or REPAIRED/RESOLVED (!faulty). If
* any suspects are still CLOSED (faulty) then the
* overall state needs to be CLOSED.
*/
- if (cip->ci_state == FMD_CASE_REPAIRED &&
+ if ((cip->ci_state == FMD_CASE_REPAIRED ||
+ cip->ci_state == FMD_CASE_RESOLVED) &&
state == FMD_CASE_CLOSED)
cip->ci_state = FMD_CASE_CLOSED;
(void) pthread_mutex_unlock(&cip->ci_lock);
@@ -1397,13 +1796,8 @@ fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
* using fmd_xprt_uuresolved().
*/
if (flags & FMD_CF_RESOLVED) {
- if (cip->ci_xprt != NULL) {
+ if (cip->ci_xprt != NULL)
fmd_list_delete(&cip->ci_mod->mod_cases, cip);
- } else {
- fmd_module_lock(cip->ci_mod);
- fmd_list_delete(&cip->ci_mod->mod_cases, cip);
- fmd_module_unlock(cip->ci_mod);
- }
} else {
fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
fmd_case_unusable_and_present,
@@ -1414,9 +1808,6 @@ fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid);
break;
}
- fmd_module_lock(cip->ci_mod);
- fmd_list_delete(&cip->ci_mod->mod_cases, cip);
- fmd_module_unlock(cip->ci_mod);
}
cip->ci_state = FMD_CASE_RESOLVED;
@@ -1455,9 +1846,6 @@ fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
return;
}
- fmd_module_lock(cip->ci_mod);
- fmd_list_delete(&cip->ci_mod->mod_cases, cip);
- fmd_module_unlock(cip->ci_mod);
resolved = 1;
break;
}
@@ -1482,17 +1870,73 @@ fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
}
if (resolved) {
- /*
- * If we transitioned to RESOLVED, adjust the reference count to
- * reflect our removal from fmd.d_rmod->mod_cases above. If the
- * caller has not placed an additional hold on the case, it
- * will now be freed.
- */
- (void) pthread_mutex_lock(&cip->ci_lock);
- fmd_asru_hash_delete_case(fmd.d_asrus, cp);
+ if (cip->ci_xprt != NULL) {
+ /*
+ * If we transitioned to RESOLVED, adjust the reference
+ * count to reflect our removal from
+ * fmd.d_rmod->mod_cases above. If the caller has not
+ * placed an additional hold on the case, it will now
+ * be freed.
+ */
+ (void) pthread_mutex_lock(&cip->ci_lock);
+ fmd_asru_hash_delete_case(fmd.d_asrus, cp);
+ (void) pthread_mutex_unlock(&cip->ci_lock);
+ fmd_case_rele(cp);
+ } else {
+ fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
+ fmd_asru_log_resolved, NULL);
+ (void) pthread_mutex_lock(&cip->ci_lock);
+ /* mark as "ready to be discarded" */
+ cip->ci_flags |= FMD_CF_RES_CMPL;
+ (void) pthread_mutex_unlock(&cip->ci_lock);
+ }
+ }
+}
+
+/*
+ * Discard any case if it is in RESOLVED state (and, if the check_if_aged
+ * argument is set, only if all suspects have passed the rsrc.aged time).
+ */
+void
+fmd_case_discard_resolved(fmd_case_t *cp, void *arg)
+{
+ int check_if_aged = *(int *)arg;
+ fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
+
+ /*
+ * First check if case has completed transition to resolved.
+ */
+ (void) pthread_mutex_lock(&cip->ci_lock);
+ if (!(cip->ci_flags & FMD_CF_RES_CMPL)) {
(void) pthread_mutex_unlock(&cip->ci_lock);
- fmd_case_rele(cp);
+ return;
+ }
+
+ /*
+ * Now if check_if_aged is set, see if all suspects have aged.
+ */
+ if (check_if_aged) {
+ int aged = 1;
+
+ fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
+ fmd_asru_check_if_aged, &aged);
+ if (!aged) {
+ (void) pthread_mutex_unlock(&cip->ci_lock);
+ return;
+ }
}
+
+ /*
+ * Finally discard the case, clearing FMD_CF_RES_CMPL so we don't
+ * do it twice.
+ */
+ fmd_module_lock(cip->ci_mod);
+ fmd_list_delete(&cip->ci_mod->mod_cases, cip);
+ fmd_module_unlock(cip->ci_mod);
+ fmd_asru_hash_delete_case(fmd.d_asrus, cp);
+ cip->ci_flags &= ~FMD_CF_RES_CMPL;
+ (void) pthread_mutex_unlock(&cip->ci_lock);
+ fmd_case_rele(cp);
}
/*
@@ -1964,7 +2408,7 @@ fmd_case_setcode(fmd_case_t *cp, char *code)
}
/*ARGSUSED*/
-void
+static void
fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
{
int not_faulty = 0;
@@ -1978,6 +2422,11 @@ fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL)
return;
+ if (cip->ci_state == FMD_CASE_RESOLVED) {
+ cip->ci_flags |= FMD_CF_RES_CMPL;
+ return;
+ }
+
fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty,
&not_faulty);
@@ -1991,9 +2440,6 @@ fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
fmd_case_unusable_and_present, &any_unusable_and_present);
if (!any_unusable_and_present) {
- fmd_module_lock(cip->ci_mod);
- fmd_list_delete(&cip->ci_mod->mod_cases, cip);
- fmd_module_unlock(cip->ci_mod);
cip->ci_state = FMD_CASE_RESOLVED;
TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
@@ -2007,10 +2453,7 @@ fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
TRACE((FMD_DBG_CASE, "replay sending list.resolved %s",
cip->ci_uuid));
fmd_case_publish(cp, FMD_CASE_RESOLVED);
- (void) pthread_mutex_lock(&cip->ci_lock);
- fmd_asru_hash_delete_case(fmd.d_asrus, cp);
- (void) pthread_mutex_unlock(&cip->ci_lock);
- fmd_case_rele(cp);
+ cip->ci_flags |= FMD_CF_RES_CMPL;
} else {
TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
cip->ci_uuid));
diff --git a/usr/src/cmd/fm/fmd/common/fmd_case.h b/usr/src/cmd/fm/fmd/common/fmd_case.h
index 354e3f35a1..6e7b3ced54 100644
--- a/usr/src/cmd/fm/fmd/common/fmd_case.h
+++ b/usr/src/cmd/fm/fmd/common/fmd_case.h
@@ -96,6 +96,7 @@ typedef struct fmd_case_impl {
#define FMD_CF_RESOLVED 0x10 /* case has been resolved */
#define FMD_CF_INVISIBLE 0x20 /* case should be invisible */
#define FMD_CF_DELETING 0x40 /* case is about to be deleted */
+#define FMD_CF_RES_CMPL 0x80 /* transition to resolved is complete */
/*
* ci_proxy_asru flags record if we created a new asru on the proxy side and
@@ -160,6 +161,7 @@ extern int fmd_case_acquit(fmd_case_t *);
extern int fmd_case_contains(fmd_case_t *, fmd_event_t *);
extern int fmd_case_orphaned(fmd_case_t *);
extern void fmd_case_repair_replay(void);
+extern void fmd_case_discard_resolved(fmd_case_t *, void *);
#ifdef __cplusplus
}
diff --git a/usr/src/cmd/fm/fmd/common/fmd_protocol.c b/usr/src/cmd/fm/fmd/common/fmd_protocol.c
index 0a477a35d1..5e607486c8 100644
--- a/usr/src/cmd/fm/fmd/common/fmd_protocol.c
+++ b/usr/src/cmd/fm/fmd/common/fmd_protocol.c
@@ -206,7 +206,7 @@ fmd_protocol_rsrc_asru(const char *class,
nvlist_t *fmri, const char *uuid, const char *code,
boolean_t faulty, boolean_t unusable, boolean_t message, nvlist_t *event,
struct timeval *tvp, boolean_t repaired, boolean_t replaced,
- boolean_t acquitted, nvlist_t *diag_de)
+ boolean_t acquitted, boolean_t resolved, nvlist_t *diag_de)
{
nvlist_t *nvl;
int64_t tod[2];
@@ -233,6 +233,7 @@ fmd_protocol_rsrc_asru(const char *class,
err |= nvlist_add_boolean_value(nvl, FM_RSRC_ASRU_REPAIRED, repaired);
err |= nvlist_add_boolean_value(nvl, FM_RSRC_ASRU_REPLACED, replaced);
err |= nvlist_add_boolean_value(nvl, FM_RSRC_ASRU_ACQUITTED, acquitted);
+ err |= nvlist_add_boolean_value(nvl, FM_RSRC_ASRU_RESOLVED, resolved);
err |= nvlist_add_boolean_value(nvl, FM_RSRC_ASRU_UNUSABLE, unusable);
err |= nvlist_add_boolean_value(nvl, FM_SUSPECT_MESSAGE, message);
err |= nvlist_add_int64_array(nvl, FM_SUSPECT_DIAG_TIME, tod, 2);
diff --git a/usr/src/cmd/fm/fmd/common/fmd_protocol.h b/usr/src/cmd/fm/fmd/common/fmd_protocol.h
index 927a875ec3..c8f8dda280 100644
--- a/usr/src/cmd/fm/fmd/common/fmd_protocol.h
+++ b/usr/src/cmd/fm/fmd/common/fmd_protocol.h
@@ -74,7 +74,7 @@ extern nvlist_t *fmd_protocol_list(const char *, nvlist_t *,
struct timeval *);
extern nvlist_t *fmd_protocol_rsrc_asru(const char *, nvlist_t *,
const char *, const char *, boolean_t, boolean_t, boolean_t, nvlist_t *,
- struct timeval *m, boolean_t, boolean_t, boolean_t, nvlist_t *);
+ struct timeval *m, boolean_t, boolean_t, boolean_t, boolean_t, nvlist_t *);
extern nvlist_t *fmd_protocol_fmderror(int, const char *, va_list);
extern nvlist_t *fmd_protocol_moderror(struct fmd_module *, int, const char *);
extern nvlist_t *fmd_protocol_xprt_ctl(struct fmd_module *,
diff --git a/usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c b/usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c
index 47671d13ae..5c0a1e5e51 100644
--- a/usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c
+++ b/usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c
@@ -500,7 +500,22 @@ fmd_adm_rsrcflush_1_svc(char *name, int *rvp, struct svc_req *req)
{
int err = FMD_ADM_ERR_RSRCNOTF;
- fmd_adm_do_repair(name, req, &err, FMD_ASRU_REPAIRED, NULL);
+ /*
+ * If anyone does an fmadm flush command, discard any resolved
+ * cases that were being retained for historic diagnosis.
+ */
+ if (fmd_rpc_deny(req))
+ err = FMD_ADM_ERR_PERM;
+ else {
+ fmd_asru_hash_apply_by_asru(fmd.d_asrus, name,
+ fmd_asru_flush, &err);
+ fmd_asru_hash_apply_by_label(fmd.d_asrus, name,
+ fmd_asru_flush, &err);
+ fmd_asru_hash_apply_by_fru(fmd.d_asrus, name,
+ fmd_asru_flush, &err);
+ fmd_asru_hash_apply_by_rsrc(fmd.d_asrus, name,
+ fmd_asru_flush, &err);
+ }
*rvp = err;
return (TRUE);
}
diff --git a/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c b/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c
index 32e9c6504c..ea1e49a1a2 100644
--- a/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c
+++ b/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c
@@ -406,8 +406,8 @@ static void
cma_recv_list(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class)
{
char *uuid = NULL;
- nvlist_t **nva;
- uint_t nvc = 0;
+ nvlist_t **nva, **save_nva;
+ uint_t nvc = 0, save_nvc;
uint_t keepopen;
int err = 0;
nvlist_t *asru = NULL;
@@ -421,7 +421,8 @@ cma_recv_list(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class)
return;
}
- keepopen = nvc;
+ save_nvc = keepopen = nvc;
+ save_nva = nva;
while (nvc-- != 0 && (strcmp(class, FM_LIST_SUSPECT_CLASS) != 0 ||
!fmd_case_uuclosed(hdl, uuid))) {
nvlist_t *nvl = *nva++;
@@ -455,6 +456,24 @@ cma_recv_list(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class)
}
/*
+ * Run through again to catch any new faults in list.updated.
+ */
+ while (save_nvc-- != 0 && (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)) {
+ nvlist_t *nvl = *save_nva++;
+ const cma_subscriber_t *subr;
+ int has_fault;
+
+ if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL)
+ continue;
+ if (subr->subr_func != NULL) {
+ has_fault = fmd_nvl_fmri_has_fault(hdl, asru,
+ FMD_HAS_FAULT_ASRU, NULL);
+ if (has_fault == 1)
+ err = subr->subr_func(hdl, nvl, asru, uuid, 0);
+ }
+ }
+
+ /*
* Do not close the case if we are handling cache faults.
*/
if (asru != NULL) {
diff --git a/usr/src/cmd/fm/modules/common/io-retire/rio_main.c b/usr/src/cmd/fm/modules/common/io-retire/rio_main.c
index c5953a70cb..b7509311ed 100644
--- a/usr/src/cmd/fm/modules/common/io-retire/rio_main.c
+++ b/usr/src/cmd/fm/modules/common/io-retire/rio_main.c
@@ -139,7 +139,8 @@ rio_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
/*
* If disabled, we don't do retire. We still do unretires though
*/
- if (global_disable && strcmp(class, FM_LIST_SUSPECT_CLASS) == 0) {
+ if (global_disable && (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
+ strcmp(class, FM_LIST_UPDATED_CLASS) == 0)) {
fmd_hdl_debug(hdl, "rio_recv: retire disabled\n");
return;
}
@@ -226,6 +227,51 @@ rio_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
}
}
}
+ /*
+ * Run through again to handle new faults in a list.updated.
+ */
+ for (f = 0; f < nfaults; f++) {
+ if (nvlist_lookup_boolean_value(faults[f], FM_SUSPECT_RETIRE,
+ &rtr) == 0 && !rtr) {
+ fmd_hdl_debug(hdl, "rio_recv: retire suppressed");
+ continue;
+ }
+
+ if (nvlist_lookup_nvlist(faults[f], FM_FAULT_ASRU,
+ &asru) != 0) {
+ fmd_hdl_debug(hdl, "rio_recv: no asru in fault");
+ continue;
+ }
+
+ scheme = NULL;
+ if (nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 ||
+ strcmp(scheme, FM_FMRI_SCHEME_DEV) != 0) {
+ fmd_hdl_debug(hdl, "rio_recv: not \"dev\" scheme: %s",
+ scheme ? scheme : "<NULL>");
+ continue;
+ }
+
+ if (fault_exception(hdl, faults[f]))
+ continue;
+
+ if (nvlist_lookup_string(asru, FM_FMRI_DEV_PATH,
+ &path) != 0 || path[0] == '\0') {
+ fmd_hdl_debug(hdl, "rio_recv: no dev path in asru");
+ continue;
+ }
+
+ if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) {
+ if (fmd_nvl_fmri_has_fault(hdl, asru,
+ FMD_HAS_FAULT_ASRU, NULL) == 1) {
+ error = di_retire_device(path, &drt, 0);
+ if (error != 0) {
+ fmd_hdl_debug(hdl, "rio_recv:"
+ " di_retire_device failed:"
+ " error: %d %s", error, path);
+ }
+ }
+ }
+ }
/*
* Don't send uuclose or uuresolved unless at least one suspect
diff --git a/usr/src/uts/common/sys/fm/protocol.h b/usr/src/uts/common/sys/fm/protocol.h
index bcd3850816..df97ad6038 100644
--- a/usr/src/uts/common/sys/fm/protocol.h
+++ b/usr/src/uts/common/sys/fm/protocol.h
@@ -122,6 +122,7 @@ extern "C" {
#define FM_RSRC_ASRU_REPAIRED "repaired"
#define FM_RSRC_ASRU_REPLACED "replaced"
#define FM_RSRC_ASRU_ACQUITTED "acquitted"
+#define FM_RSRC_ASRU_RESOLVED "resolved"
#define FM_RSRC_ASRU_UNUSABLE "unusable"
#define FM_RSRC_ASRU_EVENT "event"