diff options
Diffstat (limited to 'usr/src/cmd/fm/fmd/common/fmd_asru.c')
-rw-r--r-- | usr/src/cmd/fm/fmd/common/fmd_asru.c | 329 |
1 files changed, 247 insertions, 82 deletions
diff --git a/usr/src/cmd/fm/fmd/common/fmd_asru.c b/usr/src/cmd/fm/fmd/common/fmd_asru.c index ff3c6ba367..058a3ef384 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_asru.c +++ b/usr/src/cmd/fm/fmd/common/fmd_asru.c @@ -164,7 +164,7 @@ fmd_asru_hash_lookup(fmd_asru_hash_t *ahp, const char *name) } static int -fmd_asru_is_present(nvlist_t *event) +fmd_asru_replacement_state(nvlist_t *event) { int ps = -1; nvlist_t *asru, *fru, *rsrc; @@ -181,16 +181,36 @@ fmd_asru_is_present(nvlist_t *event) * as still present. */ if (fmd_asru_fake_not_present) - ps = 0; - if (ps == -1 && nvlist_lookup_nvlist(event, FM_FAULT_ASRU, &asru) == 0) - ps = fmd_fmri_present(asru); - if (ps == -1 && nvlist_lookup_nvlist(event, FM_FAULT_RESOURCE, - &rsrc) == 0) - ps = fmd_fmri_present(rsrc); - if (ps == -1 && nvlist_lookup_nvlist(event, FM_FAULT_FRU, &fru) == 0) - ps = fmd_fmri_present(fru); + return (fmd_asru_fake_not_present); + if (nvlist_lookup_nvlist(event, FM_FAULT_ASRU, &asru) == 0) + ps = fmd_fmri_replaced(asru); + if (ps == -1) { + if (nvlist_lookup_nvlist(event, FM_FAULT_RESOURCE, &rsrc) == 0) + ps = fmd_fmri_replaced(rsrc); + } else if (ps == FMD_OBJ_STATE_UNKNOWN) { + /* see if we can improve on UNKNOWN */ + if (nvlist_lookup_nvlist(event, FM_FAULT_RESOURCE, + &rsrc) == 0) { + int ps2 = fmd_fmri_replaced(rsrc); + if (ps2 == FMD_OBJ_STATE_STILL_PRESENT || + ps2 == FMD_OBJ_STATE_REPLACED) + ps = ps2; + } + } + if (ps == -1) { + if (nvlist_lookup_nvlist(event, FM_FAULT_FRU, &fru) == 0) + ps = fmd_fmri_replaced(fru); + } else if (ps == FMD_OBJ_STATE_UNKNOWN) { + /* see if we can improve on UNKNOWN */ + if (nvlist_lookup_nvlist(event, FM_FAULT_FRU, &fru) == 0) { + int ps2 = fmd_fmri_replaced(fru); + if (ps2 == FMD_OBJ_STATE_STILL_PRESENT || + ps2 == FMD_OBJ_STATE_REPLACED) + ps = ps2; + } + } if (ps == -1) - ps = 1; + ps = FMD_OBJ_STATE_UNKNOWN; return (ps); } @@ -404,7 +424,10 @@ static void fmd_asru_hash_recreate(fmd_log_t *lp, fmd_event_t *ep, fmd_asru_hash_t *ahp) { nvlist_t *nvl = 
FMD_EVENT_NVL(ep); - boolean_t f, u, ps, us; + boolean_t faulty = FMD_B_FALSE, unusable = FMD_B_FALSE; + int ps; + boolean_t repaired = FMD_B_FALSE, replaced = FMD_B_FALSE; + boolean_t acquitted = FMD_B_FALSE; nvlist_t *flt, *flt_copy, *asru; char *case_uuid = NULL, *case_code = NULL; fmd_asru_t *ap; @@ -420,7 +443,8 @@ fmd_asru_hash_recreate(fmd_log_t *lp, fmd_event_t *ep, fmd_asru_hash_t *ahp) /* * Extract the most recent values of 'faulty' from the event log. */ - if (nvlist_lookup_boolean_value(nvl, FM_RSRC_ASRU_FAULTY, &f) != 0) { + if (nvlist_lookup_boolean_value(nvl, FM_RSRC_ASRU_FAULTY, + &faulty) != 0) { fmd_error(EFMD_ASRU_EVENT, "failed to reload asru %s: " "invalid event log record\n", lp->log_name); ahp->ah_error = EFMD_ASRU_EVENT; @@ -434,16 +458,25 @@ fmd_asru_hash_recreate(fmd_log_t *lp, fmd_event_t *ep, fmd_asru_hash_t *ahp) } (void) nvlist_lookup_string(nvl, FM_RSRC_ASRU_UUID, &case_uuid); (void) nvlist_lookup_string(nvl, FM_RSRC_ASRU_CODE, &case_code); + (void) nvlist_lookup_boolean_value(nvl, FM_RSRC_ASRU_UNUSABLE, + &unusable); + (void) nvlist_lookup_boolean_value(nvl, FM_RSRC_ASRU_REPAIRED, + &repaired); + (void) nvlist_lookup_boolean_value(nvl, FM_RSRC_ASRU_REPLACED, + &replaced); + (void) nvlist_lookup_boolean_value(nvl, FM_RSRC_ASRU_ACQUITTED, + &acquitted); /* - * Attempt to recreate the case in the CLOSED state. + * Attempt to recreate the case in either the CLOSED or REPAIRED state + * (depending on whether the faulty bit is still set). * If the case is already present, fmd_case_recreate() will return it. * If not, we'll create a new orphaned case. Either way, we use the * ASRU event to insert a suspect into the partially-restored case. */ fmd_module_lock(fmd.d_rmod); - cp = fmd_case_recreate(fmd.d_rmod, NULL, FMD_CASE_CLOSED, case_uuid, - case_code); + cp = fmd_case_recreate(fmd.d_rmod, NULL, faulty ? 
FMD_CASE_CLOSED : + FMD_CASE_REPAIRED, case_uuid, case_code); fmd_case_hold(cp); fmd_module_unlock(fmd.d_rmod); if (nvlist_lookup_int64_array(nvl, FM_SUSPECT_DIAG_TIME, &diag_time, @@ -478,37 +511,31 @@ fmd_asru_hash_recreate(fmd_log_t *lp, fmd_event_t *ep, fmd_asru_hash_t *ahp) ap = alp->al_asru; /* - * Check to see if the resource is still present in the system. If - * so, then update the value of the unusable bit based on the current - * system configuration. If not, then consider unusable. + * Check to see if the resource is still present in the system. */ - ps = fmd_asru_is_present(flt); - if (ps) { - if (nvlist_lookup_nvlist(flt, FM_FAULT_ASRU, &asru) != 0) - u = FMD_B_FALSE; - else if ((us = fmd_fmri_unusable(asru)) == -1) { - fmd_error(EFMD_ASRU_FMRI, "failed to update " - "status of asru %s", lp->log_name); - u = FMD_B_FALSE; - } else - u = us != 0; - - } else - u = FMD_B_TRUE; /* not present; set unusable */ + ps = fmd_asru_replacement_state(flt); + if (ps == FMD_OBJ_STATE_STILL_PRESENT || ps == FMD_OBJ_STATE_UNKNOWN) + ap->asru_flags |= FMD_ASRU_PRESENT; + else if (ps == FMD_OBJ_STATE_REPLACED) + replaced = FMD_B_TRUE; nvlist_free(flt); ap->asru_flags |= FMD_ASRU_RECREATED; - if (ps) - ap->asru_flags |= FMD_ASRU_PRESENT; - if (f) { + if (faulty) { alp->al_flags |= FMD_ASRU_FAULTY; ap->asru_flags |= FMD_ASRU_FAULTY; } - if (u) { + if (unusable) { alp->al_flags |= FMD_ASRU_UNUSABLE; ap->asru_flags |= FMD_ASRU_UNUSABLE; } + if (replaced) + alp->al_reason = FMD_ASRU_REPLACED; + else if (repaired) + alp->al_reason = FMD_ASRU_REPAIRED; + else if (acquitted) + alp->al_reason = FMD_ASRU_ACQUITTED; TRACE((FMD_DBG_ASRU, "asru %s recreated as %p (%s)", alp->al_uuid, (void *)ap, _fmd_asru_snames[ap->asru_flags & FMD_ASRU_STATE])); @@ -629,29 +656,34 @@ fmd_asru_hash_replay(fmd_asru_hash_t *ahp) * Check if the resource is still present. If not, and if the rsrc.age time * has expired, then do an implicit repair on the resource. 
*/ +/*ARGSUSED*/ static void -fmd_asru_repair_if_aged(fmd_asru_link_t *alp, void *er) +fmd_asru_repair_if_aged(fmd_asru_link_t *alp, void *arg) { struct timeval tv; fmd_log_t *lp; hrtime_t hrt; + int ps; + int err; - if (fmd_asru_is_present(alp->al_event)) - return; - fmd_time_gettimeofday(&tv); - lp = fmd_log_open(alp->al_asru->asru_root, alp->al_uuid, FMD_LOG_ASRU); - hrt = (hrtime_t)(tv.tv_sec - lp->log_stat.st_mtime); - fmd_log_rele(lp); - if (hrt * NANOSEC >= fmd.d_asrus->ah_lifetime) - fmd_asru_repair(alp, er); + ps = fmd_asru_replacement_state(alp->al_event); + if (ps == FMD_OBJ_STATE_REPLACED) { + fmd_asru_replaced(alp, &err); + } else if (ps == FMD_OBJ_STATE_NOT_PRESENT) { + fmd_time_gettimeofday(&tv); + lp = fmd_log_open(alp->al_asru->asru_root, alp->al_uuid, + FMD_LOG_ASRU); + hrt = (hrtime_t)(tv.tv_sec - lp->log_stat.st_mtime); + fmd_log_rele(lp); + if (hrt * NANOSEC >= fmd.d_asrus->ah_lifetime) + fmd_asru_removed(alp); + } } void fmd_asru_clear_aged_rsrcs() { - int err; - - fmd_asru_al_hash_apply(fmd.d_asrus, fmd_asru_repair_if_aged, &err); + fmd_asru_al_hash_apply(fmd.d_asrus, fmd_asru_repair_if_aged, NULL); } fmd_asru_hash_t * @@ -881,25 +913,6 @@ fmd_asru_hash_lookup_name(fmd_asru_hash_t *ahp, const char *name) } /* - * Lookup an asru in the hash and place a hold on it. - */ -fmd_asru_t * -fmd_asru_hash_lookup_nvl(fmd_asru_hash_t *ahp, nvlist_t *fmri) -{ - fmd_asru_t *ap; - char *name = NULL; - ssize_t namelen; - - if (fmd_asru_get_namestr(fmri, &name, &namelen) != 0) - return (NULL); - (void) pthread_rwlock_rdlock(&ahp->ah_lock); - ap = fmd_asru_hash_lookup(ahp, name); - (void) pthread_rwlock_unlock(&ahp->ah_lock); - fmd_free(name, namelen + 1); - return (ap); -} - -/* * Create a resource cache entry using the fault event "nvl" for one of the * suspects from the case "cp". 
* @@ -1109,12 +1122,13 @@ static void fmd_asru_repair_containee(fmd_asru_link_t *alp, void *er) { if (er && alp->al_asru_fmri && fmd_fmri_contains(er, - alp->al_asru_fmri) > 0 && fmd_asru_clrflags(alp, FMD_ASRU_FAULTY)) + alp->al_asru_fmri) > 0 && fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, + FMD_ASRU_REPAIRED)) fmd_case_update(alp->al_case); } void -fmd_asru_repair(fmd_asru_link_t *alp, void *er) +fmd_asru_repaired(fmd_asru_link_t *alp, void *er) { int flags; int rval; @@ -1122,7 +1136,7 @@ fmd_asru_repair(fmd_asru_link_t *alp, void *er) /* * repair this asru cache entry */ - rval = fmd_asru_clrflags(alp, FMD_ASRU_FAULTY); + rval = fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, FMD_ASRU_REPAIRED); /* * now check if all entries associated with this asru are repaired and @@ -1149,12 +1163,134 @@ fmd_asru_repair(fmd_asru_link_t *alp, void *er) } static void +fmd_asru_acquit_containee(fmd_asru_link_t *alp, void *er) +{ + if (er && alp->al_asru_fmri && fmd_fmri_contains(er, + alp->al_asru_fmri) > 0 && fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, + FMD_ASRU_ACQUITTED)) + fmd_case_update(alp->al_case); +} + +void +fmd_asru_acquit(fmd_asru_link_t *alp, void *er) +{ + int flags; + int rval; + + /* + * acquit this asru cache entry + */ + rval = fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, FMD_ASRU_ACQUITTED); + + /* + * now check if all entries associated with this asru are acquitted and + * if so acquit containees + */ + (void) pthread_mutex_lock(&alp->al_asru->asru_lock); + flags = alp->al_asru->asru_flags; + (void) pthread_mutex_unlock(&alp->al_asru->asru_lock); + if (!(flags & FMD_ASRU_FAULTY)) + fmd_asru_al_hash_apply(fmd.d_asrus, fmd_asru_acquit_containee, + alp->al_asru_fmri); + + /* + * if called from fmd_adm_acquit() and we really did clear the bit then + * we need to do a case update to see if the associated case can be + * repaired. No need to do this if called from fmd_case_acquit() (ie + * when er is NULL) as the case will be explicitly repaired anyway. 
+ */ + if (er) { + *(int *)er = 0; + if (rval) + fmd_case_update(alp->al_case); + } +} + +static void +fmd_asru_replaced_containee(fmd_asru_link_t *alp, void *er) +{ + if (er && alp->al_asru_fmri && fmd_fmri_contains(er, + alp->al_asru_fmri) > 0 && fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, + FMD_ASRU_REPLACED)) + fmd_case_update(alp->al_case); +} + +void +fmd_asru_replaced(fmd_asru_link_t *alp, void *er) +{ + int flags; + int rval; + int ps; + + ps = fmd_asru_replacement_state(alp->al_event); + if (ps == FMD_OBJ_STATE_STILL_PRESENT) + return; + + /* + * mark this cache entry as replaced + */ + rval = fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, FMD_ASRU_REPLACED); + + /* + * now check if all entries associated with this asru are replaced and + * if so replace containees + */ + (void) pthread_mutex_lock(&alp->al_asru->asru_lock); + flags = alp->al_asru->asru_flags; + (void) pthread_mutex_unlock(&alp->al_asru->asru_lock); + if (!(flags & FMD_ASRU_FAULTY)) + fmd_asru_al_hash_apply(fmd.d_asrus, fmd_asru_replaced_containee, + alp->al_asru_fmri); + + *(int *)er = 0; + if (rval) + fmd_case_update(alp->al_case); +} + +static void +fmd_asru_removed_containee(fmd_asru_link_t *alp, void *er) +{ + if (er && alp->al_asru_fmri && fmd_fmri_contains(er, + alp->al_asru_fmri) > 0 && fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, + 0)) + fmd_case_update(alp->al_case); +} + +void +fmd_asru_removed(fmd_asru_link_t *alp) +{ + int flags; + int rval; + + /* + * mark this cache entry as removed + */ + rval = fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, 0); + + /* + * now check if all entries associated with this asru are removed and + * if so replace containees + */ + (void) pthread_mutex_lock(&alp->al_asru->asru_lock); + flags = alp->al_asru->asru_flags; + (void) pthread_mutex_unlock(&alp->al_asru->asru_lock); + if (!(flags & FMD_ASRU_FAULTY)) + fmd_asru_al_hash_apply(fmd.d_asrus, fmd_asru_removed_containee, + alp->al_asru_fmri); + if (rval) + fmd_case_update(alp->al_case); +} + +static void 
fmd_asru_logevent(fmd_asru_link_t *alp) { fmd_asru_t *ap = alp->al_asru; - boolean_t f = (ap->asru_flags & FMD_ASRU_FAULTY) != 0; - boolean_t u = (ap->asru_flags & FMD_ASRU_UNUSABLE) != 0; - boolean_t m = (ap->asru_flags & FMD_ASRU_INVISIBLE) == 0; + boolean_t faulty = (alp->al_flags & FMD_ASRU_FAULTY) != 0; + boolean_t unusable = (alp->al_flags & FMD_ASRU_UNUSABLE) != 0; + boolean_t message = (ap->asru_flags & FMD_ASRU_INVISIBLE) == 0; + boolean_t repaired = (alp->al_reason == FMD_ASRU_REPAIRED); + boolean_t replaced = (alp->al_reason == FMD_ASRU_REPLACED); + boolean_t acquitted = (alp->al_reason == FMD_ASRU_ACQUITTED); fmd_case_impl_t *cip; fmd_event_t *e; @@ -1172,9 +1308,9 @@ fmd_asru_logevent(fmd_asru_link_t *alp) if (lp == NULL) return; /* can't log events if we can't open the log */ - nvl = fmd_protocol_rsrc_asru(_fmd_asru_events[f | (u << 1)], - alp->al_asru_fmri, cip->ci_uuid, cip->ci_code, f, u, m, - alp->al_event, &cip->ci_tv); + nvl = fmd_protocol_rsrc_asru(_fmd_asru_events[faulty | (unusable << 1)], + alp->al_asru_fmri, cip->ci_uuid, cip->ci_code, faulty, unusable, + message, alp->al_event, &cip->ci_tv, repaired, replaced, acquitted); (void) nvlist_lookup_string(nvl, FM_CLASS, &class); e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); @@ -1224,7 +1360,7 @@ fmd_asru_setflags(fmd_asru_link_t *alp, uint_t sflag) } int -fmd_asru_clrflags(fmd_asru_link_t *alp, uint_t sflag) +fmd_asru_clrflags(fmd_asru_link_t *alp, uint_t sflag, uint8_t reason) { fmd_asru_t *ap = alp->al_asru; fmd_asru_link_t *nalp; @@ -1240,9 +1376,16 @@ fmd_asru_clrflags(fmd_asru_link_t *alp, uint_t sflag) nstate = alp->al_flags & FMD_ASRU_STATE; if (nstate == ostate) { + if (reason > alp->al_reason) { + alp->al_reason = reason; + fmd_asru_logevent(alp); + (void) pthread_cond_broadcast(&ap->asru_cv); + } (void) pthread_mutex_unlock(&ap->asru_lock); return (0); } + if (reason > alp->al_reason) + alp->al_reason = reason; if (sflag == FMD_ASRU_UNUSABLE) ap->asru_flags &= 
~sflag; @@ -1277,15 +1420,36 @@ fmd_asru_al_getstate(fmd_asru_link_t *alp) { int us, st; nvlist_t *asru; + int ps; - if (fmd_asru_is_present(alp->al_event) == 0) + ps = fmd_asru_replacement_state(alp->al_event); + if (ps == FMD_OBJ_STATE_NOT_PRESENT) return ((alp->al_flags & FMD_ASRU_FAULTY) | FMD_ASRU_UNUSABLE); + if (ps == FMD_OBJ_STATE_REPLACED) { + if (alp->al_reason < FMD_ASRU_REPLACED) + alp->al_reason = FMD_ASRU_REPLACED; + return ((alp->al_flags & FMD_ASRU_FAULTY) | FMD_ASRU_UNUSABLE); + } - if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) == 0) - us = fmd_fmri_unusable(asru); - else - us = (alp->al_flags & FMD_ASRU_UNUSABLE); st = (alp->al_flags & FMD_ASRU_STATE) | FMD_ASRU_PRESENT; + if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) == 0) { + us = fmd_fmri_service_state(asru); + if (us == -1 || us == FMD_SERVICE_STATE_UNKNOWN) { + /* not supported by scheme - try fmd_fmri_unusable */ + us = fmd_fmri_unusable(asru); + } else if (us == FMD_SERVICE_STATE_UNUSABLE) { + st |= FMD_ASRU_UNUSABLE; + return (st); + } else if (us == FMD_SERVICE_STATE_OK) { + st &= ~FMD_ASRU_UNUSABLE; + return (st); + } else if (us == FMD_SERVICE_STATE_DEGRADED) { + st &= ~FMD_ASRU_UNUSABLE; + st |= FMD_ASRU_DEGRADED; + return (st); + } + } else + us = (alp->al_flags & FMD_ASRU_UNUSABLE); if (us > 0) st |= FMD_ASRU_UNUSABLE; else if (us == 0) @@ -1307,7 +1471,8 @@ fmd_asru_getstate(fmd_asru_t *ap) int us, st; if (!(ap->asru_flags & FMD_ASRU_INTERNAL) && - (fmd_asru_fake_not_present || fmd_fmri_present(ap->asru_fmri) <= 0)) + (fmd_asru_fake_not_present >= FMD_OBJ_STATE_REPLACED || + fmd_fmri_present(ap->asru_fmri) <= 0)) return (0); /* do not report non-fmd non-present resources */ us = fmd_fmri_unusable(ap->asru_fmri); |