diff options
author | Stephen Hanson <Stephen.Hanson@Sun.COM> | 2009-09-26 09:41:57 -0700 |
---|---|---|
committer | Stephen Hanson <Stephen.Hanson@Sun.COM> | 2009-09-26 09:41:57 -0700 |
commit | 5750ef5c2584f7399d9b98bfd513c0ca9f79f66e (patch) | |
tree | 15a89d35e4c9fa75fe19a41f4bd8a0cc24413df2 | |
parent | e07b36b5f925802c7a364a9b5dcbd2a8d184e2c6 (diff) | |
download | illumos-gate-5750ef5c2584f7399d9b98bfd513c0ca9f79f66e.tar.gz |
6778240 generic historic diagnosis rules
-rw-r--r-- | usr/src/cmd/fm/fmadm/common/faulty.c | 158 | ||||
-rw-r--r-- | usr/src/cmd/fm/fmd/common/fmd.c | 7 | ||||
-rw-r--r-- | usr/src/cmd/fm/fmd/common/fmd_asru.c | 96 | ||||
-rw-r--r-- | usr/src/cmd/fm/fmd/common/fmd_asru.h | 4 | ||||
-rw-r--r-- | usr/src/cmd/fm/fmd/common/fmd_case.c | 623 | ||||
-rw-r--r-- | usr/src/cmd/fm/fmd/common/fmd_case.h | 2 | ||||
-rw-r--r-- | usr/src/cmd/fm/fmd/common/fmd_protocol.c | 3 | ||||
-rw-r--r-- | usr/src/cmd/fm/fmd/common/fmd_protocol.h | 2 | ||||
-rw-r--r-- | usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c | 17 | ||||
-rw-r--r-- | usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c | 25 | ||||
-rw-r--r-- | usr/src/cmd/fm/modules/common/io-retire/rio_main.c | 48 | ||||
-rw-r--r-- | usr/src/uts/common/sys/fm/protocol.h | 1 |
12 files changed, 744 insertions, 242 deletions
diff --git a/usr/src/cmd/fm/fmadm/common/faulty.c b/usr/src/cmd/fm/fmadm/common/faulty.c index be8c98484e..313f36a533 100644 --- a/usr/src/cmd/fm/fmadm/common/faulty.c +++ b/usr/src/cmd/fm/fmadm/common/faulty.c @@ -591,48 +591,6 @@ merge_name_list(name_list_t **list, name_list_t *new, int add_pct) return (rt); } -/* - * compare entries in two lists return true if the two lists have identical - * content. The two lists may not have entries in the same order, so we compare - * the size of the list as well as trying to find every entry from one list in - * the other. - */ -static int -cmp_name_list(name_list_t *lxp1, name_list_t *lxp2) -{ - name_list_t *lp1, *lp2; - int l1 = 0, l2 = 0, common = 0; - - lp2 = lxp2; - while (lp2) { - l2++; - lp2 = lp2->next; - if (lp2 == lxp2) - break; - } - lp1 = lxp1; - while (lp1) { - l1++; - lp2 = lxp2; - while (lp2) { - if (strcmp(lp2->name, lp1->name) == 0) { - common++; - break; - } - lp2 = lp2->next; - if (lp2 == lxp2) - break; - } - lp1 = lp1->next; - if (lp1 == lxp1) - break; - } - if (l1 == l2 && l2 == common) - return (0); - else - return (1); -} - static name_list_t * alloc_name_list(char *name, uint8_t pct) { @@ -650,24 +608,6 @@ alloc_name_list(char *name, uint8_t pct) return (nlp); } -static void -free_name_list(name_list_t *list) -{ - name_list_t *next = list; - name_list_t *lp; - - if (list) { - do { - lp = next; - next = lp->next; - if (lp->label) - free(lp->label); - free(lp->name); - free(lp); - } while (next != list); - } -} - static status_record_t * new_record_init(uurec_t *uurec_p, char *msgid, name_list_t *class, name_list_t *fru, name_list_t *asru, name_list_t *resource, @@ -863,64 +803,6 @@ catalog_new_record(uurec_t *uurec_p, char *msgid, name_list_t *class, add_list(status_rec_p, status_rec_p->asru, &status_asru_list); } -/* - * add uuid and diagnoses time to an existing record for similar fault on the - * same fru - */ -static void -catalog_merge_record(status_record_t *status_rec_p, uurec_t *uurec_p, - 
name_list_t *asru, name_list_t *resource, name_list_t *serial, - boolean_t not_suppressed) -{ - uurec_t *uurec1_p; - - status_rec_p->nrecs++; - /* add uurec in time order */ - if (status_rec_p->uurec->sec > uurec_p->sec) { - uurec_p->next = status_rec_p->uurec; - uurec_p->prev = NULL; - status_rec_p->uurec = uurec_p; - } else { - uurec1_p = status_rec_p->uurec; - while (uurec1_p->next && uurec1_p->next->sec <= uurec_p->sec) - uurec1_p = uurec1_p->next; - if (uurec1_p->next) - uurec1_p->next->prev = uurec_p; - uurec_p->next = uurec1_p->next; - uurec_p->prev = uurec1_p; - uurec1_p->next = uurec_p; - } - status_rec_p->not_suppressed |= not_suppressed; - uurec_p->asru = merge_name_list(&status_rec_p->asru, asru, 0); - (void) merge_name_list(&status_rec_p->resource, resource, 0); - (void) merge_name_list(&status_rec_p->serial, serial, 0); -} - -static status_record_t * -record_in_catalog(name_list_t *class, name_list_t *fru, - char *msgid, hostid_t *host) -{ - sr_list_t *status_rec_p; - status_record_t *srp = NULL; - - status_rec_p = status_rec_list; - while (status_rec_p) { - srp = status_rec_p->status_record; - if (host == srp->host && - cmp_name_list(class, srp->class) == 0 && - cmp_name_list(fru, srp->fru) == 0 && - strcmp(msgid, srp->msgid) == 0) - break; - if (status_rec_p->next == status_rec_list) { - srp = NULL; - break; - } else { - status_rec_p = status_rec_p->next; - } - } - return (srp); -} - static void get_serial_no(nvlist_t *nvl, name_list_t **serial_p, uint8_t pct) { @@ -993,6 +875,15 @@ extract_record_info(nvlist_t *nvl, name_list_t **class_p, (void) merge_name_list(fru_p, nlp, 1); } get_serial_no(lfru, serial_p, lpct); + } else if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) != 0) { + /* + * No FRU or resource. But we want to display the repair status + * somehow, so create a dummy FRU field. 
+ */ + nlp = alloc_name_list(dgettext("FMD", "None"), lpct); + nlp->status = status & ~(FM_SUSPECT_UNUSABLE | + FM_SUSPECT_DEGRADED); + (void) merge_name_list(fru_p, nlp, 1); } if (nvlist_lookup_nvlist(nvl, FM_FAULT_ASRU, &lasru) == 0) { name = get_nvl2str_topo(lasru); @@ -1029,7 +920,6 @@ add_fault_record_to_catalog(nvlist_t *nvl, uint64_t sec, char *uuid) name_list_t *asru = NULL, *fru = NULL, *serial = NULL; nvlist_t **nva; uint8_t *ba; - status_record_t *status_rec_p; uurec_t *uurec_p; hostid_t *host; boolean_t not_suppressed = 1; @@ -1066,19 +956,8 @@ add_fault_record_to_catalog(nvlist_t *nvl, uint64_t sec, char *uuid) uurec_p->event = NULL; (void) nvlist_dup(nvl, &uurec_p->event, 0); host = find_hostid(nvl); - if (not_suppressed && !opt_g) - status_rec_p = NULL; - else - status_rec_p = record_in_catalog(class, fru, msgid, host); - if (status_rec_p) { - catalog_merge_record(status_rec_p, uurec_p, asru, resource, - serial, not_suppressed); - free_name_list(class); - free_name_list(fru); - } else { - catalog_new_record(uurec_p, msgid, class, fru, asru, - resource, serial, not_suppressed, host); - } + catalog_new_record(uurec_p, msgid, class, fru, asru, + resource, serial, not_suppressed, host); } static void @@ -1473,10 +1352,17 @@ print_sup_record(status_record_t *srp, int opt_i, int full) } if (full || srp->fru == NULL || srp->asru == NULL) { if (srp->resource) { - print_name_list(srp->resource, - dgettext("FMD", "Problem in :"), - NULL, full ? 0 : max_display, 0, print_rsrc_status, - full); + status = asru_same_status(srp->resource); + if (status != -1) { + print_name_list(srp->resource, + dgettext("FMD", "Problem in :"), NULL, + full ? 0 : max_display, 0, NULL, full); + print_rsrc_status(status, " "); + } else + print_name_list(srp->resource, + dgettext("FMD", "Problem in :"), + NULL, full ? 
0 : max_display, 0, + print_rsrc_status, full); } } if (srp->fru) { diff --git a/usr/src/cmd/fm/fmd/common/fmd.c b/usr/src/cmd/fm/fmd/common/fmd.c index e904c2c74d..fe61e14a79 100644 --- a/usr/src/cmd/fm/fmd/common/fmd.c +++ b/usr/src/cmd/fm/fmd/common/fmd.c @@ -288,6 +288,7 @@ static const fmd_conf_formal_t _fmd_conf[] = { { "rpc.api.prog", &fmd_conf_uint32, "100170" }, /* FMD_API rpc program num */ { "rpc.rcvsize", &fmd_conf_size, "128k" }, /* rpc receive buffer size */ { "rpc.sndsize", &fmd_conf_size, "128k" }, /* rpc send buffer size */ +{ "rsrc.pollperiod", &fmd_conf_time, "1h" }, /* aged rsrcs poller period */ { "rsrc.age", &fmd_conf_time, "30d" }, /* max age of old rsrc log */ { "rsrc.zero", &fmd_conf_bool, "false" }, /* zero rsrc cache on start? */ { "schemedir", &fmd_conf_string, _fmd_scheme_path }, /* path for scheme mods */ @@ -705,12 +706,12 @@ fmd_gc(fmd_t *dp, id_t id, hrtime_t hrt) static void fmd_clear_aged_rsrcs(fmd_t *dp, id_t id, hrtime_t hrt) { - hrtime_t delta; + hrtime_t period; fmd_asru_clear_aged_rsrcs(); - (void) fmd_conf_getprop(dp->d_conf, "rsrc.age", &delta); + (void) fmd_conf_getprop(dp->d_conf, "rsrc.pollperiod", &period); (void) fmd_timerq_install(dp->d_timers, dp->d_rmod->mod_timerids, - (fmd_timer_f *)fmd_clear_aged_rsrcs, dp, NULL, delta/10); + (fmd_timer_f *)fmd_clear_aged_rsrcs, dp, NULL, period); } /* diff --git a/usr/src/cmd/fm/fmd/common/fmd_asru.c b/usr/src/cmd/fm/fmd/common/fmd_asru.c index 07a98d51fe..31e98168e0 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_asru.c +++ b/usr/src/cmd/fm/fmd/common/fmd_asru.c @@ -441,7 +441,7 @@ fmd_asru_hash_recreate(fmd_log_t *lp, fmd_event_t *ep, fmd_asru_hash_t *ahp) boolean_t faulty = FMD_B_FALSE, unusable = FMD_B_FALSE; int ps; boolean_t repaired = FMD_B_FALSE, replaced = FMD_B_FALSE; - boolean_t acquitted = FMD_B_FALSE; + boolean_t acquitted = FMD_B_FALSE, resolved = FMD_B_FALSE; nvlist_t *flt, *flt_copy, *asru; char *case_uuid = NULL, *case_code = NULL; fmd_asru_t *ap; @@ -481,17 +481,20 
@@ fmd_asru_hash_recreate(fmd_log_t *lp, fmd_event_t *ep, fmd_asru_hash_t *ahp) &replaced); (void) nvlist_lookup_boolean_value(nvl, FM_RSRC_ASRU_ACQUITTED, &acquitted); + (void) nvlist_lookup_boolean_value(nvl, FM_RSRC_ASRU_RESOLVED, + &resolved); /* - * Attempt to recreate the case in either the CLOSED or REPAIRED state - * (depending on whether the faulty bit is still set). + * Attempt to recreate the case in CLOSED, REPAIRED or RESOLVED state + * (depending on whether the faulty/resolved bits are set). * If the case is already present, fmd_case_recreate() will return it. * If not, we'll create a new orphaned case. Either way, we use the * ASRU event to insert a suspect into the partially-restored case. */ fmd_module_lock(fmd.d_rmod); cp = fmd_case_recreate(fmd.d_rmod, NULL, faulty ? FMD_CASE_CLOSED : - FMD_CASE_REPAIRED, case_uuid, case_code); + resolved ? FMD_CASE_RESOLVED : FMD_CASE_REPAIRED, case_uuid, + case_code); fmd_case_hold(cp); fmd_module_unlock(fmd.d_rmod); if (nvlist_lookup_int64_array(nvl, FM_SUSPECT_DIAG_TIME, &diag_time, @@ -581,6 +584,8 @@ fmd_asru_hash_recreate(fmd_log_t *lp, fmd_event_t *ep, fmd_asru_hash_t *ahp) alp->al_reason = FMD_ASRU_REPAIRED; else if (acquitted) alp->al_reason = FMD_ASRU_ACQUITTED; + else + alp->al_reason = FMD_ASRU_REMOVED; TRACE((FMD_DBG_ASRU, "asru %s recreated as %p (%s)", alp->al_uuid, (void *)ap, _fmd_asru_snames[ap->asru_flags & FMD_ASRU_STATE])); @@ -712,6 +717,9 @@ fmd_asru_repair_if_aged(fmd_asru_link_t *alp, void *arg) int err; fmd_asru_rep_arg_t fara; + if (!(alp->al_flags & FMD_ASRU_FAULTY)) + return; + /* * Checking for aged resources only happens on the diagnosing side * not on a proxy. @@ -740,10 +748,55 @@ fmd_asru_repair_if_aged(fmd_asru_link_t *alp, void *arg) } } +/*ARGSUSED*/ +void +fmd_asru_check_if_aged(fmd_asru_link_t *alp, void *arg) +{ + struct timeval tv; + fmd_log_t *lp; + hrtime_t hrt; + + /* + * Case must be in resolved state for this to be called. 
So modified + * time on resource cache entry should be the time the resolve occurred. + * Return 0 if not yet hit rsrc.aged. + */ + fmd_time_gettimeofday(&tv); + lp = fmd_log_open(alp->al_asru->asru_root, alp->al_uuid, FMD_LOG_ASRU); + if (lp == NULL) + return; + hrt = (hrtime_t)(tv.tv_sec - lp->log_stat.st_mtime); + fmd_log_rele(lp); + if (hrt * NANOSEC < fmd.d_asrus->ah_lifetime) + *(int *)arg = 0; +} + +/*ARGSUSED*/ +void +fmd_asru_most_recent(fmd_asru_link_t *alp, void *arg) +{ + fmd_log_t *lp; + uint64_t hrt; + + /* + * Find most recent modified time of a set of resource cache entries. + */ + lp = fmd_log_open(alp->al_asru->asru_root, alp->al_uuid, FMD_LOG_ASRU); + if (lp == NULL) + return; + hrt = lp->log_stat.st_mtime; + fmd_log_rele(lp); + if (*(uint64_t *)arg < hrt) + *(uint64_t *)arg = hrt; +} + void fmd_asru_clear_aged_rsrcs() { + int check_if_aged = 1; fmd_asru_al_hash_apply(fmd.d_asrus, fmd_asru_repair_if_aged, NULL); + fmd_case_hash_apply(fmd.d_cases, fmd_case_discard_resolved, + &check_if_aged); } fmd_asru_hash_t * @@ -1298,6 +1351,22 @@ fmd_asru_repaired(fmd_asru_link_t *alp, void *arg) } /* + * Discard the case associated with this alp if it is in resolved state. + * Called on "fmadm flush". + */ +/*ARGSUSED*/ +void +fmd_asru_flush(fmd_asru_link_t *alp, void *arg) +{ + int check_if_aged = 0; + int *rval = (int *)arg; + + if (alp->al_case) + fmd_case_discard_resolved(alp->al_case, &check_if_aged); + *rval = 0; +} + +/* * This is only called for proxied faults. Set various flags so we can * find the nature of the transport from the resource cache code. */ @@ -1459,7 +1528,8 @@ fmd_asru_logevent(fmd_asru_link_t *alp) nvl = fmd_protocol_rsrc_asru(_fmd_asru_events[faulty | (unusable << 1)], alp->al_asru_fmri, cip->ci_uuid, cip->ci_code, faulty, unusable, message, alp->al_event, &cip->ci_tv, repaired, replaced, acquitted, - cip->ci_diag_de == NULL ? cip->ci_mod->mod_fmri : cip->ci_diag_de); + cip->ci_state == FMD_CASE_RESOLVED, cip->ci_diag_de == NULL ? 
+ cip->ci_mod->mod_fmri : cip->ci_diag_de); (void) nvlist_lookup_string(nvl, FM_CLASS, &class); e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); @@ -1525,7 +1595,9 @@ fmd_asru_clrflags(fmd_asru_link_t *alp, uint_t sflag, uint8_t reason) nstate = alp->al_flags & FMD_ASRU_STATE; if (nstate == ostate) { - if (reason > alp->al_reason) { + if (reason > alp->al_reason && + ((fmd_case_impl_t *)alp->al_case)->ci_state < + FMD_CASE_REPAIRED) { alp->al_reason = reason; fmd_asru_logevent(alp); (void) pthread_cond_broadcast(&ap->asru_cv); @@ -1560,6 +1632,18 @@ fmd_asru_clrflags(fmd_asru_link_t *alp, uint_t sflag, uint8_t reason) return (1); } +/*ARGSUSED*/ +void +fmd_asru_log_resolved(fmd_asru_link_t *alp, void *unused) +{ + fmd_asru_t *ap = alp->al_asru; + + (void) pthread_mutex_lock(&ap->asru_lock); + fmd_asru_logevent(alp); + (void) pthread_cond_broadcast(&ap->asru_cv); + (void) pthread_mutex_unlock(&ap->asru_lock); +} + /* * Report the current known state of the link entry (ie this particular fault * affecting this particular ASRU). 
diff --git a/usr/src/cmd/fm/fmd/common/fmd_asru.h b/usr/src/cmd/fm/fmd/common/fmd_asru.h index 4d3bc8042a..c20f376df2 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_asru.h +++ b/usr/src/cmd/fm/fmd/common/fmd_asru.h @@ -195,6 +195,7 @@ typedef struct { char *fara_uuid; /* uuid can be passed in for comparison */ } fmd_asru_rep_arg_t; extern void fmd_asru_repaired(fmd_asru_link_t *, void *); +extern void fmd_asru_flush(fmd_asru_link_t *, void *); typedef struct { int *faus_countp; @@ -225,8 +226,11 @@ extern void fmd_asru_close_status(fmd_asru_link_t *alp, void *arg); extern int fmd_asru_setflags(fmd_asru_link_t *, uint_t); extern int fmd_asru_clrflags(fmd_asru_link_t *, uint_t, uint8_t); +extern void fmd_asru_log_resolved(fmd_asru_link_t *, void *); extern int fmd_asru_al_getstate(fmd_asru_link_t *); extern int fmd_asru_getstate(fmd_asru_t *); +extern void fmd_asru_check_if_aged(fmd_asru_link_t *, void *); +void fmd_asru_most_recent(fmd_asru_link_t *, void *); #ifdef __cplusplus } diff --git a/usr/src/cmd/fm/fmd/common/fmd_case.c b/usr/src/cmd/fm/fmd/common/fmd_case.c index 8a03b670eb..5e40c593b2 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_case.c +++ b/usr/src/cmd/fm/fmd/common/fmd_case.c @@ -220,6 +220,39 @@ fmd_case_hash_apply(fmd_case_hash_t *chp, } static void +fmd_case_hash_apply_except_current(fmd_case_hash_t *chp, + void (*func)(fmd_case_t *, void *), void *arg, fmd_case_t *current) +{ + fmd_case_impl_t *cp, **cps, **cpp; + uint_t cpc, i; + + (void) pthread_rwlock_rdlock(&chp->ch_lock); + + cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); + cpc = chp->ch_count; + + for (i = 0; i < chp->ch_hashlen; i++) { + for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) + if (cp != (fmd_case_impl_t *)current) + *cpp++ = fmd_case_tryhold(cp); + else + *cpp++ = cp; + } + + ASSERT(cpp == cps + cpc); + (void) pthread_rwlock_unlock(&chp->ch_lock); + + for (i = 0; i < cpc; i++) { + if (cps[i] != NULL && cps[i] != (fmd_case_impl_t *)current) { + 
func((fmd_case_t *)cps[i], arg); + fmd_case_rele((fmd_case_t *)cps[i]); + } + } + + fmd_free(cps, cpc * sizeof (fmd_case_t *)); +} + +static void fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) { uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; @@ -453,6 +486,12 @@ fmd_case_mkevent(fmd_case_t *cp, const char *class) return (nvl); } +static int fmd_case_match_on_faulty_overlap = 1; +static int fmd_case_match_on_acquit_overlap = 1; +static int fmd_case_auto_acquit_isolated = 1; +static int fmd_case_auto_acquit_non_acquitted = 1; +static int fmd_case_too_recent = 10; /* time in seconds */ + static boolean_t fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem) { @@ -498,82 +537,377 @@ done: } static int -fmd_case_match_suspect(fmd_case_susp_t *cis, fmd_case_susp_t *xcis) +fmd_case_match_suspect(nvlist_t *nvl1, nvlist_t *nvl2) { char *class, *new_class; - if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_ASRU)) + if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_ASRU)) return (0); - if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, - FM_FAULT_RESOURCE)) + if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_RESOURCE)) return (0); - if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_FRU)) + if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_FRU)) return (0); - (void) nvlist_lookup_string(xcis->cis_nvl, FM_CLASS, &class); - (void) nvlist_lookup_string(cis->cis_nvl, FM_CLASS, &new_class); + (void) nvlist_lookup_string(nvl2, FM_CLASS, &class); + (void) nvlist_lookup_string(nvl1, FM_CLASS, &new_class); return (strcmp(class, new_class) == 0); } +typedef struct { + int *fcms_countp; + int fcms_maxcount; + fmd_case_impl_t *fcms_cip; + uint8_t *fcms_new_susp_state; + uint8_t *fcms_old_susp_state; + uint8_t *fcms_old_match_state; +} fcms_t; +#define SUSPECT_STATE_FAULTY 0x1 +#define SUSPECT_STATE_ISOLATED 0x2 +#define SUSPECT_STATE_REMOVED 0x4 +#define SUSPECT_STATE_ACQUITED 0x8 +#define 
SUSPECT_STATE_REPAIRED 0x10 +#define SUSPECT_STATE_REPLACED 0x20 +#define SUSPECT_STATE_NO_MATCH 0x1 + /* - * see if an identical suspect list already exists in the cache + * This is called for each suspect in the old case. Compare it against each + * suspect in the new case, setting fcms_old_susp_state and fcms_new_susp_state + * as appropriate. fcms_new_susp_state will left as 0 if the suspect is not + * found in the old case. */ -static int -fmd_case_check_for_dups(fmd_case_t *cp) +static void +fmd_case_match_suspects(fmd_asru_link_t *alp, void *arg) { - fmd_case_impl_t *cip = (fmd_case_impl_t *)cp, *xcip; - fmd_case_hash_t *chp = fmd.d_cases; - fmd_case_susp_t *xcis, *cis; - int match = 0, match_susp; - uint_t h; + fcms_t *fcmsp = (fcms_t *)arg; + fmd_case_impl_t *cip = fcmsp->fcms_cip; + fmd_case_susp_t *cis; + int i = 0; + int state = fmd_asru_al_getstate(alp); - (void) pthread_rwlock_rdlock(&chp->ch_lock); + if (*fcmsp->fcms_countp >= fcmsp->fcms_maxcount) + return; + + if (!(state & FMD_ASRU_PRESENT) || (!(state & FMD_ASRU_FAULTY) && + alp->al_reason == FMD_ASRU_REMOVED)) + fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = + SUSPECT_STATE_REMOVED; + else if ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_FAULTY)) + fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = + SUSPECT_STATE_ISOLATED; + else if (state & FMD_ASRU_FAULTY) + fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = + SUSPECT_STATE_FAULTY; + else if (alp->al_reason == FMD_ASRU_REPLACED) + fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = + SUSPECT_STATE_REPLACED; + else if (alp->al_reason == FMD_ASRU_ACQUITTED) + fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = + SUSPECT_STATE_ACQUITED; + else + fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = + SUSPECT_STATE_REPAIRED; + + for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next, i++) + if (fmd_case_match_suspect(cis->cis_nvl, alp->al_event) == 1) + break; + if (cis != NULL) + fcmsp->fcms_new_susp_state[i] = + 
fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp]; + else + fcmsp->fcms_old_match_state[*fcmsp->fcms_countp] |= + SUSPECT_STATE_NO_MATCH; + (*fcmsp->fcms_countp)++; +} + +typedef struct { + int *fca_do_update; + fmd_case_impl_t *fca_cip; +} fca_t; + +/* + * Re-fault all acquitted suspects that are still present in the new list. + */ +static void +fmd_case_fault_acquitted_matching(fmd_asru_link_t *alp, void *arg) +{ + fca_t *fcap = (fca_t *)arg; + fmd_case_impl_t *cip = fcap->fca_cip; + fmd_case_susp_t *cis; + int state = fmd_asru_al_getstate(alp); + + if (!(state & FMD_ASRU_FAULTY) && + alp->al_reason == FMD_ASRU_ACQUITTED) { + for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) + if (fmd_case_match_suspect(cis->cis_nvl, + alp->al_event) == 1) + break; + if (cis != NULL) { + (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); + *fcap->fca_do_update = 1; + } + } +} + +/* + * Re-fault all suspects that are still present in the new list. + */ +static void +fmd_case_fault_all_matching(fmd_asru_link_t *alp, void *arg) +{ + fca_t *fcap = (fca_t *)arg; + fmd_case_impl_t *cip = fcap->fca_cip; + fmd_case_susp_t *cis; + int state = fmd_asru_al_getstate(alp); + + if (!(state & FMD_ASRU_FAULTY)) { + for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) + if (fmd_case_match_suspect(cis->cis_nvl, + alp->al_event) == 1) + break; + if (cis != NULL) { + (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); + *fcap->fca_do_update = 1; + } + } +} + +/* + * Acquit all suspects that are no longer present in the new list. 
+ */ +static void +fmd_case_acquit_no_match(fmd_asru_link_t *alp, void *arg) +{ + fca_t *fcap = (fca_t *)arg; + fmd_case_impl_t *cip = fcap->fca_cip; + fmd_case_susp_t *cis; + int state = fmd_asru_al_getstate(alp); + + if (state & FMD_ASRU_FAULTY) { + for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) + if (fmd_case_match_suspect(cis->cis_nvl, + alp->al_event) == 1) + break; + if (cis == NULL) { + (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, + FMD_ASRU_ACQUITTED); + *fcap->fca_do_update = 1; + } + } +} + +/* + * Acquit all isolated suspects. + */ +static void +fmd_case_acquit_isolated(fmd_asru_link_t *alp, void *arg) +{ + int *do_update = (int *)arg; + int state = fmd_asru_al_getstate(alp); + + if ((state & FMD_ASRU_PRESENT) && (state & FMD_ASRU_UNUSABLE) && + (state & FMD_ASRU_FAULTY)) { + (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, + FMD_ASRU_ACQUITTED); + *do_update = 1; + } +} + +/* + * Acquit suspect which matches specified nvlist + */ +static void +fmd_case_acquit_suspect(fmd_asru_link_t *alp, void *arg) +{ + nvlist_t *nvl = (nvlist_t *)arg; + int state = fmd_asru_al_getstate(alp); + + if ((state & FMD_ASRU_FAULTY) && + fmd_case_match_suspect(nvl, alp->al_event) == 1) + (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, + FMD_ASRU_ACQUITTED); +} + +typedef struct { + fmd_case_impl_t *fccd_cip; + uint8_t *fccd_new_susp_state; + uint8_t *fccd_new_match_state; + int *fccd_discard_new; + int *fccd_adjust_new; +} fccd_t; + +/* + * see if a matching suspect list already exists in the cache + */ +static void +fmd_case_check_for_dups(fmd_case_t *old_cp, void *arg) +{ + fccd_t *fccdp = (fccd_t *)arg; + fmd_case_impl_t *new_cip = fccdp->fccd_cip; + fmd_case_impl_t *old_cip = (fmd_case_impl_t *)old_cp; + int i, count = 0, do_update = 0, got_isolated_overlap = 0; + int got_faulty_overlap = 0; + int got_acquit_overlap = 0; + boolean_t too_recent; + uint64_t most_recent = 0; + fcms_t fcms; + fca_t fca; + uint8_t *new_susp_state; + uint8_t *old_susp_state; + 
uint8_t *old_match_state; + + new_susp_state = alloca(new_cip->ci_nsuspects * sizeof (uint8_t)); + for (i = 0; i < new_cip->ci_nsuspects; i++) + new_susp_state[i] = 0; + old_susp_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t)); + for (i = 0; i < old_cip->ci_nsuspects; i++) + old_susp_state[i] = 0; + old_match_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t)); + for (i = 0; i < old_cip->ci_nsuspects; i++) + old_match_state[i] = 0; /* - * Find all cases with this code + * Compare with each suspect in the existing case. */ - h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; - for (xcip = chp->ch_code_hash[h]; xcip != NULL; - xcip = xcip->ci_code_next) { - /* - * only look for any cases (apart from this one) - * whose code and number of suspects match - */ - if (xcip == cip || fmd_case_tryhold(xcip) == NULL) - continue; - if (strcmp(xcip->ci_code, cip->ci_code) != 0 || - xcip->ci_nsuspects != cip->ci_nsuspects) { - fmd_case_rele((fmd_case_t *)xcip); - continue; + fcms.fcms_countp = &count; + fcms.fcms_maxcount = old_cip->ci_nsuspects; + fcms.fcms_cip = new_cip; + fcms.fcms_new_susp_state = new_susp_state; + fcms.fcms_old_susp_state = old_susp_state; + fcms.fcms_old_match_state = old_match_state; + fmd_asru_hash_apply_by_case(fmd.d_asrus, (fmd_case_t *)old_cip, + fmd_case_match_suspects, &fcms); + + /* + * If we have some faulty, non-isolated suspects that overlap, then most + * likely it is the suspects that overlap in the suspect lists that are + * to blame. So we can consider this to be a match. + */ + for (i = 0; i < new_cip->ci_nsuspects; i++) + if (new_susp_state[i] == SUSPECT_STATE_FAULTY) + got_faulty_overlap = 1; + if (got_faulty_overlap && fmd_case_match_on_faulty_overlap) + goto got_match; + + /* + * If we have no faulty, non-isolated suspects in the old case, but we + * do have some acquitted suspects that overlap, then most likely it is + * the acquitted suspects that overlap in the suspect lists that are + * to blame. 
So we can consider this to be a match. + */ + for (i = 0; i < new_cip->ci_nsuspects; i++) + if (new_susp_state[i] == SUSPECT_STATE_ACQUITED) + got_acquit_overlap = 1; + for (i = 0; i < old_cip->ci_nsuspects; i++) + if (old_susp_state[i] == SUSPECT_STATE_FAULTY) + got_acquit_overlap = 0; + if (got_acquit_overlap && fmd_case_match_on_acquit_overlap) + goto got_match; + + /* + * Check that all suspects in the new list are present in the old list. + * Return if we find one that isn't. + */ + for (i = 0; i < new_cip->ci_nsuspects; i++) + if (new_susp_state[i] == 0) + return; + + /* + * Check that all suspects in the old list are present in the new list + * *or* they are isolated or removed/replaced (which would explain why + * they are not present in the new list). Return if we find one that is + * faulty and unisolated or repaired or acquitted, and that is not + * present in the new case. + */ + for (i = 0; i < old_cip->ci_nsuspects; i++) + if (old_match_state[i] == SUSPECT_STATE_NO_MATCH && + (old_susp_state[i] == SUSPECT_STATE_FAULTY || + old_susp_state[i] == SUSPECT_STATE_ACQUITED || + old_susp_state[i] == SUSPECT_STATE_REPAIRED)) + return; + +got_match: + /* + * If the old case is already in repaired/resolved state, we can't + * do anything more with it, so keep the new case, but acquit some + * of the suspects if appropriate. 
+ */ + if (old_cip->ci_state >= FMD_CASE_REPAIRED) { + if (fmd_case_auto_acquit_non_acquitted) { + *fccdp->fccd_adjust_new = 1; + for (i = 0; i < new_cip->ci_nsuspects; i++) { + fccdp->fccd_new_susp_state[i] |= + new_susp_state[i]; + if (new_susp_state[i] == 0) + fccdp->fccd_new_susp_state[i] = + SUSPECT_STATE_NO_MATCH; + } } + return; + } + /* + * Otherwise discard the new case and keep the old, again updating the + * state of the suspects as appropriate + */ + *fccdp->fccd_discard_new = 1; + fca.fca_cip = new_cip; + fca.fca_do_update = &do_update; + + /* + * See if new case occurred within fmd_case_too_recent seconds of the + * most recent modification to the old case and if so don't do + * auto-acquit. This avoids problems if a flood of ereports come in and + * they don't all get diagnosed before the first case causes some of + * the devices to be isolated making it appear that an isolated device + * was in the suspect list. + */ + fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, + fmd_asru_most_recent, &most_recent); + too_recent = (new_cip->ci_tv.tv_sec - most_recent < + fmd_case_too_recent); + + if (got_faulty_overlap) { /* - * For each suspect in one list, check if there - * is an identical suspect in the other list + * Acquit any suspects not present in the new list, plus + * any that are present but are isolated. 
*/ - match = 1; - for (xcis = xcip->ci_suspects; xcis != NULL; - xcis = xcis->cis_next) { - match_susp = 0; - for (cis = cip->ci_suspects; cis != NULL; - cis = cis->cis_next) { - if (fmd_case_match_suspect(cis, xcis) == 1) { - match_susp = 1; - break; - } - } - if (match_susp == 0) { - match = 0; - break; - } - } - fmd_case_rele((fmd_case_t *)xcip); - if (match) { - (void) pthread_rwlock_unlock(&chp->ch_lock); - return (1); + fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, + fmd_case_acquit_no_match, &fca); + if (fmd_case_auto_acquit_isolated && !too_recent) + fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, + fmd_case_acquit_isolated, &do_update); + } else if (got_acquit_overlap) { + /* + * Re-fault the acquitted matching suspects and acquit all + * isolated suspects. + */ + if (fmd_case_auto_acquit_isolated && !too_recent) { + fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, + fmd_case_fault_acquitted_matching, &fca); + fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, + fmd_case_acquit_isolated, &do_update); } + } else if (fmd_case_auto_acquit_isolated) { + /* + * To get here, there must be no faulty or acquitted suspects, + * but there must be at least one isolated suspect. Just acquit + * non-matching isolated suspects. If there are no matching + * isolated suspects, then re-fault all matching suspects. 
+ */ + for (i = 0; i < new_cip->ci_nsuspects; i++) + if (new_susp_state[i] == SUSPECT_STATE_ISOLATED) + got_isolated_overlap = 1; + if (!got_isolated_overlap) + fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, + fmd_case_fault_all_matching, &fca); + fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, + fmd_case_acquit_no_match, &fca); } - (void) pthread_rwlock_unlock(&chp->ch_lock); - return (0); + + /* + * If we've updated anything in the old case, call fmd_case_update() + */ + if (do_update) + fmd_case_update(old_cp); } /* @@ -610,22 +944,49 @@ fmd_case_convict(fmd_case_t *cp) { fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; fmd_asru_hash_t *ahp = fmd.d_asrus; - + int discard_new = 0, i; fmd_case_susp_t *cis; fmd_asru_link_t *alp; + uint8_t *new_susp_state; + uint8_t *new_match_state; + int adjust_new = 0; + fccd_t fccd; (void) pthread_mutex_lock(&cip->ci_lock); if (cip->ci_code == NULL) (void) fmd_case_mkcode(cp); else if (cip->ci_precanned) fmd_case_code_hash_insert(fmd.d_cases, cip); - if (fmd_case_check_for_dups(cp) == 1) { + + /* + * First we must see if any matching cases already exist. + */ + new_susp_state = alloca(cip->ci_nsuspects * sizeof (uint8_t)); + for (i = 0; i < cip->ci_nsuspects; i++) + new_susp_state[i] = 0; + new_match_state = alloca(cip->ci_nsuspects * sizeof (uint8_t)); + for (i = 0; i < cip->ci_nsuspects; i++) + new_match_state[i] = 0; + fccd.fccd_cip = cip; + fccd.fccd_adjust_new = &adjust_new; + fccd.fccd_new_susp_state = new_susp_state; + fccd.fccd_new_match_state = new_match_state; + fccd.fccd_discard_new = &discard_new; + fmd_case_hash_apply_except_current(fmd.d_cases, fmd_case_check_for_dups, + &fccd, cp); + + if (discard_new) { + /* + * We've found an existing case that is a match and it is not + * already in repaired or resolved state. So we can close this + * one as a duplicate. 
+ */ (void) pthread_mutex_unlock(&cip->ci_lock); return (1); } /* - * no suspect list already exists - allocate new cache entries + * Allocate new cache entries */ for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { if ((alp = fmd_asru_hash_create_entry(ahp, @@ -640,6 +1001,45 @@ fmd_case_convict(fmd_case_t *cp) (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); } + if (adjust_new) { + int some_suspect = 0, some_not_suspect = 0; + + /* + * There is one or more matching case but they are already in + * repaired or resolved state. So we need to keep the new + * case, but we can adjust it. Repaired/removed/replaced + * suspects are unlikely to be to blame (unless there are + * actually two separate faults). So if we have a combination of + * repaired/replaced/removed suspects and acquitted suspects in + * the old lists, then we should acquit in the new list those + * that were repaired/replaced/removed in the old. + */ + for (i = 0; i < cip->ci_nsuspects; i++) { + if ((new_susp_state[i] & SUSPECT_STATE_REPLACED) || + (new_susp_state[i] & SUSPECT_STATE_REPAIRED) || + (new_susp_state[i] & SUSPECT_STATE_REMOVED) || + (new_match_state[i] & SUSPECT_STATE_NO_MATCH)) + some_not_suspect = 1; + else + some_suspect = 1; + } + if (some_suspect && some_not_suspect) { + for (cis = cip->ci_suspects, i = 0; cis != NULL; + cis = cis->cis_next, i++) + if ((new_susp_state[i] & + SUSPECT_STATE_REPLACED) || + (new_susp_state[i] & + SUSPECT_STATE_REPAIRED) || + (new_susp_state[i] & + SUSPECT_STATE_REMOVED) || + (new_match_state[i] & + SUSPECT_STATE_NO_MATCH)) + fmd_asru_hash_apply_by_case(fmd.d_asrus, + cp, fmd_case_acquit_suspect, + cis->cis_nvl); + } + } + (void) pthread_mutex_unlock(&cip->ci_lock); return (0); } @@ -934,8 +1334,6 @@ fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); fmd_case_impl_t *eip; - ASSERT(state < FMD_CASE_RESOLVED); - (void) pthread_mutex_init(&cip->ci_lock, NULL); 
fmd_buf_hash_create(&cip->ci_bufs); @@ -987,11 +1385,12 @@ fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, /* * When recreating an orphan case, state passed in may - * either be CLOSED (faulty) or REPAIRED (!faulty). If + * be CLOSED (faulty) or REPAIRED/RESOLVED (!faulty). If * any suspects are still CLOSED (faulty) then the * overall state needs to be CLOSED. */ - if (cip->ci_state == FMD_CASE_REPAIRED && + if ((cip->ci_state == FMD_CASE_REPAIRED || + cip->ci_state == FMD_CASE_RESOLVED) && state == FMD_CASE_CLOSED) cip->ci_state = FMD_CASE_CLOSED; (void) pthread_mutex_unlock(&cip->ci_lock); @@ -1397,13 +1796,8 @@ fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) * using fmd_xprt_uuresolved(). */ if (flags & FMD_CF_RESOLVED) { - if (cip->ci_xprt != NULL) { + if (cip->ci_xprt != NULL) fmd_list_delete(&cip->ci_mod->mod_cases, cip); - } else { - fmd_module_lock(cip->ci_mod); - fmd_list_delete(&cip->ci_mod->mod_cases, cip); - fmd_module_unlock(cip->ci_mod); - } } else { fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_unusable_and_present, @@ -1414,9 +1808,6 @@ fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid); break; } - fmd_module_lock(cip->ci_mod); - fmd_list_delete(&cip->ci_mod->mod_cases, cip); - fmd_module_unlock(cip->ci_mod); } cip->ci_state = FMD_CASE_RESOLVED; @@ -1455,9 +1846,6 @@ fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) return; } - fmd_module_lock(cip->ci_mod); - fmd_list_delete(&cip->ci_mod->mod_cases, cip); - fmd_module_unlock(cip->ci_mod); resolved = 1; break; } @@ -1482,17 +1870,73 @@ fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) } if (resolved) { - /* - * If we transitioned to RESOLVED, adjust the reference count to - * reflect our removal from fmd.d_rmod->mod_cases above. If the - * caller has not placed an additional hold on the case, it - * will now be freed. 
- */ - (void) pthread_mutex_lock(&cip->ci_lock); - fmd_asru_hash_delete_case(fmd.d_asrus, cp); + if (cip->ci_xprt != NULL) { + /* + * If we transitioned to RESOLVED, adjust the reference + * count to reflect our removal from + * fmd.d_rmod->mod_cases above. If the caller has not + * placed an additional hold on the case, it will now + * be freed. + */ + (void) pthread_mutex_lock(&cip->ci_lock); + fmd_asru_hash_delete_case(fmd.d_asrus, cp); + (void) pthread_mutex_unlock(&cip->ci_lock); + fmd_case_rele(cp); + } else { + fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, + fmd_asru_log_resolved, NULL); + (void) pthread_mutex_lock(&cip->ci_lock); + /* mark as "ready to be discarded" */ + cip->ci_flags |= FMD_CF_RES_CMPL; + (void) pthread_mutex_unlock(&cip->ci_lock); + } + } +} + +/* + * Discard any case if it is in RESOLVED state (and, if the check_if_aged argument + * is set, only if all suspects have passed the rsrc.aged time). + */ +void +fmd_case_discard_resolved(fmd_case_t *cp, void *arg) +{ + int check_if_aged = *(int *)arg; + fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; + + /* + * First check if case has completed transition to resolved. + */ + (void) pthread_mutex_lock(&cip->ci_lock); + if (!(cip->ci_flags & FMD_CF_RES_CMPL)) { (void) pthread_mutex_unlock(&cip->ci_lock); - fmd_case_rele(cp); + return; + } + + /* + * Now if check_if_aged is set, see if all suspects have aged. + */ + if (check_if_aged) { + int aged = 1; + + fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, + fmd_asru_check_if_aged, &aged); + if (!aged) { + (void) pthread_mutex_unlock(&cip->ci_lock); + return; + } } + + /* + * Finally discard the case, clearing FMD_CF_RES_CMPL so we don't + * do it twice.
+ */ + fmd_module_lock(cip->ci_mod); + fmd_list_delete(&cip->ci_mod->mod_cases, cip); + fmd_module_unlock(cip->ci_mod); + fmd_asru_hash_delete_case(fmd.d_asrus, cp); + cip->ci_flags &= ~FMD_CF_RES_CMPL; + (void) pthread_mutex_unlock(&cip->ci_lock); + fmd_case_rele(cp); } /* @@ -1964,7 +2408,7 @@ fmd_case_setcode(fmd_case_t *cp, char *code) } /*ARGSUSED*/ -void +static void fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) { int not_faulty = 0; @@ -1978,6 +2422,11 @@ fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL) return; + if (cip->ci_state == FMD_CASE_RESOLVED) { + cip->ci_flags |= FMD_CF_RES_CMPL; + return; + } + fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty, &not_faulty); @@ -1991,9 +2440,6 @@ fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_unusable_and_present, &any_unusable_and_present); if (!any_unusable_and_present) { - fmd_module_lock(cip->ci_mod); - fmd_list_delete(&cip->ci_mod->mod_cases, cip); - fmd_module_unlock(cip->ci_mod); cip->ci_state = FMD_CASE_RESOLVED; TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", @@ -2007,10 +2453,7 @@ fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) TRACE((FMD_DBG_CASE, "replay sending list.resolved %s", cip->ci_uuid)); fmd_case_publish(cp, FMD_CASE_RESOLVED); - (void) pthread_mutex_lock(&cip->ci_lock); - fmd_asru_hash_delete_case(fmd.d_asrus, cp); - (void) pthread_mutex_unlock(&cip->ci_lock); - fmd_case_rele(cp); + cip->ci_flags |= FMD_CF_RES_CMPL; } else { TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", cip->ci_uuid)); diff --git a/usr/src/cmd/fm/fmd/common/fmd_case.h b/usr/src/cmd/fm/fmd/common/fmd_case.h index 354e3f35a1..6e7b3ced54 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_case.h +++ b/usr/src/cmd/fm/fmd/common/fmd_case.h @@ -96,6 +96,7 @@ typedef struct fmd_case_impl { #define
FMD_CF_RESOLVED 0x10 /* case has been resolved */ #define FMD_CF_INVISIBLE 0x20 /* case should be invisible */ #define FMD_CF_DELETING 0x40 /* case is about to be deleted */ +#define FMD_CF_RES_CMPL 0x80 /* transition to resolved is complete */ /* * ci_proxy_asru flags record if we created a new asru on the proxy side and @@ -160,6 +161,7 @@ extern int fmd_case_acquit(fmd_case_t *); extern int fmd_case_contains(fmd_case_t *, fmd_event_t *); extern int fmd_case_orphaned(fmd_case_t *); extern void fmd_case_repair_replay(void); +extern void fmd_case_discard_resolved(fmd_case_t *, void *); #ifdef __cplusplus } diff --git a/usr/src/cmd/fm/fmd/common/fmd_protocol.c b/usr/src/cmd/fm/fmd/common/fmd_protocol.c index 0a477a35d1..5e607486c8 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_protocol.c +++ b/usr/src/cmd/fm/fmd/common/fmd_protocol.c @@ -206,7 +206,7 @@ fmd_protocol_rsrc_asru(const char *class, nvlist_t *fmri, const char *uuid, const char *code, boolean_t faulty, boolean_t unusable, boolean_t message, nvlist_t *event, struct timeval *tvp, boolean_t repaired, boolean_t replaced, - boolean_t acquitted, nvlist_t *diag_de) + boolean_t acquitted, boolean_t resolved, nvlist_t *diag_de) { nvlist_t *nvl; int64_t tod[2]; @@ -233,6 +233,7 @@ fmd_protocol_rsrc_asru(const char *class, err |= nvlist_add_boolean_value(nvl, FM_RSRC_ASRU_REPAIRED, repaired); err |= nvlist_add_boolean_value(nvl, FM_RSRC_ASRU_REPLACED, replaced); err |= nvlist_add_boolean_value(nvl, FM_RSRC_ASRU_ACQUITTED, acquitted); + err |= nvlist_add_boolean_value(nvl, FM_RSRC_ASRU_RESOLVED, resolved); err |= nvlist_add_boolean_value(nvl, FM_RSRC_ASRU_UNUSABLE, unusable); err |= nvlist_add_boolean_value(nvl, FM_SUSPECT_MESSAGE, message); err |= nvlist_add_int64_array(nvl, FM_SUSPECT_DIAG_TIME, tod, 2); diff --git a/usr/src/cmd/fm/fmd/common/fmd_protocol.h b/usr/src/cmd/fm/fmd/common/fmd_protocol.h index 927a875ec3..c8f8dda280 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_protocol.h +++ 
b/usr/src/cmd/fm/fmd/common/fmd_protocol.h @@ -74,7 +74,7 @@ extern nvlist_t *fmd_protocol_list(const char *, nvlist_t *, struct timeval *); extern nvlist_t *fmd_protocol_rsrc_asru(const char *, nvlist_t *, const char *, const char *, boolean_t, boolean_t, boolean_t, nvlist_t *, - struct timeval *m, boolean_t, boolean_t, boolean_t, nvlist_t *); + struct timeval *m, boolean_t, boolean_t, boolean_t, boolean_t, nvlist_t *); extern nvlist_t *fmd_protocol_fmderror(int, const char *, va_list); extern nvlist_t *fmd_protocol_moderror(struct fmd_module *, int, const char *); extern nvlist_t *fmd_protocol_xprt_ctl(struct fmd_module *, diff --git a/usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c b/usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c index 47671d13ae..5c0a1e5e51 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c +++ b/usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c @@ -500,7 +500,22 @@ fmd_adm_rsrcflush_1_svc(char *name, int *rvp, struct svc_req *req) { int err = FMD_ADM_ERR_RSRCNOTF; - fmd_adm_do_repair(name, req, &err, FMD_ASRU_REPAIRED, NULL); + /* + * If anyone does an fmadm flush command, discard any resolved + * cases that were being retained for historic diagnosis. 
+ */ + if (fmd_rpc_deny(req)) + err = FMD_ADM_ERR_PERM; + else { + fmd_asru_hash_apply_by_asru(fmd.d_asrus, name, + fmd_asru_flush, &err); + fmd_asru_hash_apply_by_label(fmd.d_asrus, name, + fmd_asru_flush, &err); + fmd_asru_hash_apply_by_fru(fmd.d_asrus, name, + fmd_asru_flush, &err); + fmd_asru_hash_apply_by_rsrc(fmd.d_asrus, name, + fmd_asru_flush, &err); + } *rvp = err; return (TRUE); } diff --git a/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c b/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c index 32e9c6504c..ea1e49a1a2 100644 --- a/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c +++ b/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c @@ -406,8 +406,8 @@ static void cma_recv_list(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class) { char *uuid = NULL; - nvlist_t **nva; - uint_t nvc = 0; + nvlist_t **nva, **save_nva; + uint_t nvc = 0, save_nvc; uint_t keepopen; int err = 0; nvlist_t *asru = NULL; @@ -421,7 +421,8 @@ cma_recv_list(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class) return; } - keepopen = nvc; + save_nvc = keepopen = nvc; + save_nva = nva; while (nvc-- != 0 && (strcmp(class, FM_LIST_SUSPECT_CLASS) != 0 || !fmd_case_uuclosed(hdl, uuid))) { nvlist_t *nvl = *nva++; @@ -455,6 +456,24 @@ cma_recv_list(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class) } /* + * Run through again to catch any new faults in list.updated. + */ + while (save_nvc-- != 0 && (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)) { + nvlist_t *nvl = *save_nva++; + const cma_subscriber_t *subr; + int has_fault; + + if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) + continue; + if (subr->subr_func != NULL) { + has_fault = fmd_nvl_fmri_has_fault(hdl, asru, + FMD_HAS_FAULT_ASRU, NULL); + if (has_fault == 1) + err = subr->subr_func(hdl, nvl, asru, uuid, 0); + } + } + + /* * Do not close the case if we are handling cache faults.
*/ if (asru != NULL) { diff --git a/usr/src/cmd/fm/modules/common/io-retire/rio_main.c b/usr/src/cmd/fm/modules/common/io-retire/rio_main.c index c5953a70cb..b7509311ed 100644 --- a/usr/src/cmd/fm/modules/common/io-retire/rio_main.c +++ b/usr/src/cmd/fm/modules/common/io-retire/rio_main.c @@ -139,7 +139,8 @@ rio_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) /* * If disabled, we don't do retire. We still do unretires though */ - if (global_disable && strcmp(class, FM_LIST_SUSPECT_CLASS) == 0) { + if (global_disable && (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 || + strcmp(class, FM_LIST_UPDATED_CLASS) == 0)) { fmd_hdl_debug(hdl, "rio_recv: retire disabled\n"); return; } @@ -226,6 +227,51 @@ rio_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) } } } + /* + * Run through again to handle new faults in a list.updated. + */ + for (f = 0; f < nfaults; f++) { + if (nvlist_lookup_boolean_value(faults[f], FM_SUSPECT_RETIRE, + &rtr) == 0 && !rtr) { + fmd_hdl_debug(hdl, "rio_recv: retire suppressed"); + continue; + } + + if (nvlist_lookup_nvlist(faults[f], FM_FAULT_ASRU, + &asru) != 0) { + fmd_hdl_debug(hdl, "rio_recv: no asru in fault"); + continue; + } + + scheme = NULL; + if (nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 || + strcmp(scheme, FM_FMRI_SCHEME_DEV) != 0) { + fmd_hdl_debug(hdl, "rio_recv: not \"dev\" scheme: %s", + scheme ? 
scheme : "<NULL>"); + continue; + } + + if (fault_exception(hdl, faults[f])) + continue; + + if (nvlist_lookup_string(asru, FM_FMRI_DEV_PATH, + &path) != 0 || path[0] == '\0') { + fmd_hdl_debug(hdl, "rio_recv: no dev path in asru"); + continue; + } + + if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) { + if (fmd_nvl_fmri_has_fault(hdl, asru, + FMD_HAS_FAULT_ASRU, NULL) == 1) { + error = di_retire_device(path, &drt, 0); + if (error != 0) { + fmd_hdl_debug(hdl, "rio_recv:" + " di_retire_device failed:" + " error: %d %s", error, path); + } + } + } + } /* * Don't send uuclose or uuresolved unless at least one suspect diff --git a/usr/src/uts/common/sys/fm/protocol.h b/usr/src/uts/common/sys/fm/protocol.h index bcd3850816..df97ad6038 100644 --- a/usr/src/uts/common/sys/fm/protocol.h +++ b/usr/src/uts/common/sys/fm/protocol.h @@ -122,6 +122,7 @@ extern "C" { #define FM_RSRC_ASRU_REPAIRED "repaired" #define FM_RSRC_ASRU_REPLACED "replaced" #define FM_RSRC_ASRU_ACQUITTED "acquitted" +#define FM_RSRC_ASRU_RESOLVED "resolved" #define FM_RSRC_ASRU_UNUSABLE "unusable" #define FM_RSRC_ASRU_EVENT "event" |