diff options
author | Stephen Hanson <Stephen.Hanson@Sun.COM> | 2009-03-20 02:22:05 -0700 |
---|---|---|
committer | Stephen Hanson <Stephen.Hanson@Sun.COM> | 2009-03-20 02:22:05 -0700 |
commit | cbf75e67acb6c32a2f4884f28a839d59f7988d37 (patch) | |
tree | 797d512a8c6a0afbe2083be228bb43cce9be5c93 | |
parent | aa4b59d395817702a402da9bd7a40537fcbff526 (diff) | |
download | illumos-gate-cbf75e67acb6c32a2f4884f28a839d59f7988d37.tar.gz |
6533823 need better way of proxying faults across event transport
6788551 provide means for faults/defects to be directly injected into fmd
25 files changed, 1640 insertions, 419 deletions
diff --git a/usr/src/cmd/fm/fmadm/common/faulty.c b/usr/src/cmd/fm/fmadm/common/faulty.c index f89feb2ab4..378f724153 100644 --- a/usr/src/cmd/fm/fmadm/common/faulty.c +++ b/usr/src/cmd/fm/fmadm/common/faulty.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -193,6 +193,7 @@ typedef struct host_id { char *chassis; char *server; char *platform; + char *domain; } hostid_t; typedef struct host_id_list { @@ -265,7 +266,7 @@ format_date(char *buf, size_t len, uint64_t sec) } static hostid_t * -find_hostid_in_list(char *platform, char *chassis, char *server) +find_hostid_in_list(char *platform, char *chassis, char *server, char *domain) { hostid_t *rt = NULL; host_id_list_t *hostp; @@ -281,7 +282,9 @@ find_hostid_in_list(char *platform, char *chassis, char *server) hostp->hostid.server && strcmp(hostp->hostid.server, server) == 0 && (chassis == NULL || hostp->hostid.chassis == NULL || - strcmp(chassis, hostp->hostid.chassis) == 0)) { + strcmp(chassis, hostp->hostid.chassis) == 0) && + (domain == NULL || hostp->hostid.domain == NULL || + strcmp(domain, hostp->hostid.domain) == 0)) { rt = &hostp->hostid; break; } @@ -292,6 +295,7 @@ find_hostid_in_list(char *platform, char *chassis, char *server) hostp->hostid.platform = strdup(platform); hostp->hostid.server = strdup(server); hostp->hostid.chassis = chassis ? strdup(chassis) : NULL; + hostp->hostid.domain = domain ? 
strdup(domain) : NULL; hostp->next = host_list; host_list = hostp; rt = &hostp->hostid; @@ -303,7 +307,7 @@ find_hostid_in_list(char *platform, char *chassis, char *server) static hostid_t * find_hostid(nvlist_t *nvl) { - char *platform = NULL, *chassis = NULL, *server = NULL; + char *platform = NULL, *chassis = NULL, *server = NULL, *domain = NULL; nvlist_t *auth, *fmri; hostid_t *rt = NULL; @@ -314,7 +318,8 @@ find_hostid(nvlist_t *nvl) (void) nvlist_lookup_string(auth, FM_FMRI_AUTH_SERVER, &server); (void) nvlist_lookup_string(auth, FM_FMRI_AUTH_CHASSIS, &chassis); - rt = find_hostid_in_list(platform, chassis, server); + (void) nvlist_lookup_string(auth, FM_FMRI_AUTH_DOMAIN, &domain); + rt = find_hostid_in_list(platform, chassis, server, domain); } return (rt); } @@ -1367,6 +1372,33 @@ print_fru_status(int status, char *label) } static void +print_rsrc_status(int status, char *label) +{ + char *msg = ""; + + if (status & FM_SUSPECT_NOT_PRESENT) + msg = dgettext("FMD", "not present"); + else if (status & FM_SUSPECT_FAULTY) { + if (status & FM_SUSPECT_DEGRADED) + msg = dgettext("FMD", + "faulted but still providing degraded service"); + else if (status & FM_SUSPECT_UNUSABLE) + msg = dgettext("FMD", + "faulted and taken out of service"); + else + msg = dgettext("FMD", "faulted but still in service"); + } else if (status & FM_SUSPECT_REPLACED) + msg = dgettext("FMD", "replaced"); + else if (status & FM_SUSPECT_REPAIRED) + msg = dgettext("FMD", "repair attempted"); + else if (status & FM_SUSPECT_ACQUITTED) + msg = dgettext("FMD", "acquitted"); + else + msg = dgettext("FMD", "removed"); + (void) printf("%s %s\n", label, msg); +} + +static void print_name_list(name_list_t *list, char *label, char *(func)(char *), int limit, int pct, void (func1)(int, char *), int full) { @@ -1476,13 +1508,6 @@ serial_in_fru(name_list_t *fru, name_list_t *serial) } static void -print_server_name(hostid_t *host, char *label) -{ - (void) printf("%s %s %s %s\n", label, host->server, 
host->platform, - host->chassis ? host->chassis : ""); -} - -static void print_sup_record(status_record_t *srp, int opt_i, int full) { char buf[32]; @@ -1521,8 +1546,15 @@ print_sup_record(status_record_t *srp, int opt_i, int full) n++; } (void) printf("\n"); - if (n_server > 1) - print_server_name(srp->host, dgettext("FMD", "Host :")); + (void) printf("%s %s", dgettext("FMD", "Host :"), + srp->host->server); + if (srp->host->domain) + (void) printf("\t%s %s", dgettext("FMD", "Domain :"), + srp->host->domain); + (void) printf("\n%s %s", dgettext("FMD", "Platform :"), + srp->host->platform); + (void) printf("\t%s %s\n\n", dgettext("FMD", "Chassis_id :"), + srp->host->chassis ? srp->host->chassis : ""); if (srp->class) print_name_list(srp->class, dgettext("FMD", "Fault class :"), NULL, 0, srp->class->pct, @@ -1539,11 +1571,11 @@ print_sup_record(status_record_t *srp, int opt_i, int full) dgettext("FMD", "Affects :"), NULL, full ? 0 : max_display, 0, print_asru_status, full); } - if (full || srp->fru == NULL) { + if (full || srp->fru == NULL || srp->asru == NULL) { if (srp->resource) { print_name_list(srp->resource, dgettext("FMD", "Problem in :"), - NULL, full ? 0 : max_display, 0, print_fru_status, + NULL, full ? 0 : max_display, 0, print_rsrc_status, full); } } diff --git a/usr/src/cmd/fm/fmd/common/fmd.c b/usr/src/cmd/fm/fmd/common/fmd.c index 903ad1d9b5..d1569d5f70 100644 --- a/usr/src/cmd/fm/fmd/common/fmd.c +++ b/usr/src/cmd/fm/fmd/common/fmd.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -584,7 +584,7 @@ fmd_destroy(fmd_t *dp) fmd_module_lock(dp->d_rmod); while ((cp = fmd_list_next(&dp->d_rmod->mod_cases)) != NULL) - fmd_case_discard(cp); + fmd_case_discard(cp, B_FALSE); fmd_module_unlock(dp->d_rmod); fmd_free(dp->d_rmod->mod_stats, sizeof (fmd_modstat_t)); diff --git a/usr/src/cmd/fm/fmd/common/fmd_api.c b/usr/src/cmd/fm/fmd/common/fmd_api.c index 85323d3281..bfad22b0db 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_api.c +++ b/usr/src/cmd/fm/fmd/common/fmd_api.c @@ -1142,7 +1142,15 @@ fmd_case_uuresolved(fmd_hdl_t *hdl, const char *uuid) fmd_case_t *cp = fmd_case_hash_lookup(fmd.d_cases, uuid); if (cp != NULL) { - fmd_case_transition(cp, FMD_CASE_RESOLVED, 0); + fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; + /* + * For a proxy, we notify the diagnosing side, and then + * wait for it to send us back a list.resolved. + */ + if (cip->ci_xprt != NULL) + fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid); + else + fmd_case_transition(cp, FMD_CASE_RESOLVED, 0); fmd_case_rele(cp); } @@ -2460,6 +2468,44 @@ fmd_xprt_translate(fmd_hdl_t *hdl, fmd_xprt_t *xp, fmd_event_t *ep) return (fmd_xprt_xtranslate(FMD_EVENT_NVL(ep), xip->xi_auth)); } +/*ARGSUSED*/ +void +fmd_xprt_add_domain(fmd_hdl_t *hdl, nvlist_t *nvl, char *domain) +{ + nvpair_t *nvp, *nvp2; + nvlist_t *nvl2, *nvl3; + char *class; + + if (nvl == NULL || domain == NULL) + return; + for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; + nvp = nvlist_next_nvpair(nvl, nvp)) { + if (strcmp(nvpair_name(nvp), FM_CLASS) == 0) { + (void) nvpair_value_string(nvp, &class); + if (strcmp(class, FM_LIST_SUSPECT_CLASS) != 0) + return; + } + } + for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; + nvp = nvlist_next_nvpair(nvl, nvp)) { + if (strcmp(nvpair_name(nvp), FM_SUSPECT_DE) == 0) { + (void) nvpair_value_nvlist(nvp, &nvl2); + for (nvp2 = nvlist_next_nvpair(nvl2, NULL); + nvp2 != NULL; + nvp2 = nvlist_next_nvpair(nvl2, nvp2)) { + if (strcmp(nvpair_name(nvp2), + FM_FMRI_AUTHORITY) == 0) { + (void) 
nvpair_value_nvlist(nvp2, &nvl3); + (void) nvlist_add_string(nvl3, + FM_FMRI_AUTH_DOMAIN, domain); + break; + } + } + break; + } + } +} + void fmd_xprt_setspecific(fmd_hdl_t *hdl, fmd_xprt_t *xp, void *data) { diff --git a/usr/src/cmd/fm/fmd/common/fmd_api.h b/usr/src/cmd/fm/fmd/common/fmd_api.h index 57d2ef2ef6..f6649875c4 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_api.h +++ b/usr/src/cmd/fm/fmd/common/fmd_api.h @@ -251,6 +251,11 @@ extern uint64_t fmd_event_ena_create(fmd_hdl_t *); #define FMD_XPRT_RDWR 0x3 /* transport is read-write */ #define FMD_XPRT_ACCEPT 0x4 /* transport is accepting connection */ #define FMD_XPRT_SUSPENDED 0x8 /* transport starts suspended */ +#define FMD_XPRT_EXTERNAL 0x80 /* xprt is external to a chassis */ +#define FMD_XPRT_NO_REMOTE_REPAIR 0x100 /* xprt does not allow remote repair */ +#define FMD_XPRT_CACHE_AS_LOCAL 0x200 /* xprt caches fault as if local */ +#define FMD_XPRT_HCONLY 0x400 /* xprt only proxies hc-scheme faults */ +#define FMD_XPRT_HC_PRESENT_ONLY 0x800 /* only locally present hc faults */ extern fmd_xprt_t *fmd_xprt_open(fmd_hdl_t *, uint_t, nvlist_t *, void *); extern void fmd_xprt_close(fmd_hdl_t *, fmd_xprt_t *); @@ -260,6 +265,7 @@ extern void fmd_xprt_suspend(fmd_hdl_t *, fmd_xprt_t *); extern void fmd_xprt_resume(fmd_hdl_t *, fmd_xprt_t *); extern int fmd_xprt_error(fmd_hdl_t *, fmd_xprt_t *); extern nvlist_t *fmd_xprt_translate(fmd_hdl_t *, fmd_xprt_t *, fmd_event_t *); +extern void fmd_xprt_add_domain(fmd_hdl_t *, nvlist_t *, char *); extern void fmd_xprt_setspecific(fmd_hdl_t *, fmd_xprt_t *, void *); extern void *fmd_xprt_getspecific(fmd_hdl_t *, fmd_xprt_t *); diff --git a/usr/src/cmd/fm/fmd/common/fmd_api.map b/usr/src/cmd/fm/fmd/common/fmd_api.map index 791a3e6593..1ac97bd09b 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_api.map +++ b/usr/src/cmd/fm/fmd/common/fmd_api.map @@ -126,4 +126,5 @@ fmd_xprt_setspecific = FUNCTION extern; fmd_xprt_suspend = FUNCTION extern; fmd_xprt_translate = FUNCTION extern; + 
fmd_xprt_add_domain = FUNCTION extern; }; diff --git a/usr/src/cmd/fm/fmd/common/fmd_asru.c b/usr/src/cmd/fm/fmd/common/fmd_asru.c index 5ed622330a..04b97efe6e 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_asru.c +++ b/usr/src/cmd/fm/fmd/common/fmd_asru.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -161,11 +161,15 @@ fmd_asru_hash_lookup(fmd_asru_hash_t *ahp, const char *name) return (ap); } +#define HC_ONLY_FALSE 0 +#define HC_ONLY_TRUE 1 + static int -fmd_asru_replacement_state(nvlist_t *event) +fmd_asru_replacement_state(nvlist_t *event, int hc_only) { int ps = -1; nvlist_t *asru, *fru, *rsrc; + char *s; /* * Check if there is evidence that this object is no longer present. @@ -177,34 +181,46 @@ fmd_asru_replacement_state(nvlist_t *event) * If we have checked all three and we still get -1 then nothing knows * whether it's present or not, so err on the safe side and treat it * as still present. + * + * Note that if hc_only is set, then we only check status using fmris + * that are in hc-scheme. 
*/ if (fmd_asru_fake_not_present) return (fmd_asru_fake_not_present); - if (nvlist_lookup_nvlist(event, FM_FAULT_ASRU, &asru) == 0) + if (nvlist_lookup_nvlist(event, FM_FAULT_ASRU, &asru) == 0 && + (hc_only == HC_ONLY_FALSE || (nvlist_lookup_string(asru, + FM_FMRI_SCHEME, &s) == 0 && strcmp(s, FM_FMRI_SCHEME_HC) == 0))) ps = fmd_fmri_replaced(asru); - if (ps == -1) { - if (nvlist_lookup_nvlist(event, FM_FAULT_RESOURCE, &rsrc) == 0) - ps = fmd_fmri_replaced(rsrc); - } else if (ps == FMD_OBJ_STATE_UNKNOWN) { - /* see if we can improve on UNKNOWN */ + if (ps == -1 || ps == FMD_OBJ_STATE_UNKNOWN) { if (nvlist_lookup_nvlist(event, FM_FAULT_RESOURCE, - &rsrc) == 0) { - int ps2 = fmd_fmri_replaced(rsrc); - if (ps2 == FMD_OBJ_STATE_STILL_PRESENT || - ps2 == FMD_OBJ_STATE_REPLACED) - ps = ps2; + &rsrc) == 0 && (hc_only == HC_ONLY_FALSE || + (nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, &s) == 0 && + strcmp(s, FM_FMRI_SCHEME_HC) == 0))) { + if (ps == -1) { + ps = fmd_fmri_replaced(rsrc); + } else { + /* see if we can improve on UNKNOWN */ + int ps2 = fmd_fmri_replaced(rsrc); + if (ps2 == FMD_OBJ_STATE_STILL_PRESENT || + ps2 == FMD_OBJ_STATE_REPLACED) + ps = ps2; + } } } - if (ps == -1) { - if (nvlist_lookup_nvlist(event, FM_FAULT_FRU, &fru) == 0) - ps = fmd_fmri_replaced(fru); - } else if (ps == FMD_OBJ_STATE_UNKNOWN) { - /* see if we can improve on UNKNOWN */ - if (nvlist_lookup_nvlist(event, FM_FAULT_FRU, &fru) == 0) { - int ps2 = fmd_fmri_replaced(fru); - if (ps2 == FMD_OBJ_STATE_STILL_PRESENT || - ps2 == FMD_OBJ_STATE_REPLACED) - ps = ps2; + if (ps == -1 || ps == FMD_OBJ_STATE_UNKNOWN) { + if (nvlist_lookup_nvlist(event, FM_FAULT_FRU, &fru) == 0 && + (hc_only == HC_ONLY_FALSE || (nvlist_lookup_string(fru, + FM_FMRI_SCHEME, &s) == 0 && + strcmp(s, FM_FMRI_SCHEME_HC) == 0))) { + if (ps == -1) { + ps = fmd_fmri_replaced(fru); + } else { + /* see if we can improve on UNKNOWN */ + int ps2 = fmd_fmri_replaced(fru); + if (ps2 == FMD_OBJ_STATE_STILL_PRESENT || + ps2 == 
FMD_OBJ_STATE_REPLACED) + ps = ps2; + } } } if (ps == -1) @@ -432,6 +448,7 @@ fmd_asru_hash_recreate(fmd_log_t *lp, fmd_event_t *ep, fmd_asru_hash_t *ahp) fmd_asru_link_t *alp; fmd_case_t *cp; int64_t *diag_time; + nvlist_t *de_fmri, *de_fmri_dup; uint_t nelem; topo_hdl_t *thp; char *class; @@ -482,6 +499,10 @@ fmd_asru_hash_recreate(fmd_log_t *lp, fmd_event_t *ep, fmd_asru_hash_t *ahp) fmd_case_settime(cp, diag_time[0], diag_time[1]); else fmd_case_settime(cp, lp->log_stat.st_ctime, 0); + if (nvlist_lookup_nvlist(nvl, FM_SUSPECT_DE, &de_fmri) == 0) { + (void) nvlist_xdup(de_fmri, &de_fmri_dup, &fmd.d_nva); + fmd_case_set_de_fmri(cp, de_fmri_dup); + } (void) nvlist_xdup(flt, &flt_copy, &fmd.d_nva); /* @@ -511,7 +532,7 @@ fmd_asru_hash_recreate(fmd_log_t *lp, fmd_event_t *ep, fmd_asru_hash_t *ahp) /* * Check to see if the resource is still present in the system. */ - ps = fmd_asru_replacement_state(flt); + ps = fmd_asru_replacement_state(flt, HC_ONLY_FALSE); if (ps == FMD_OBJ_STATE_REPLACED) { replaced = FMD_B_TRUE; } else if (ps == FMD_OBJ_STATE_STILL_PRESENT || @@ -689,18 +710,33 @@ fmd_asru_repair_if_aged(fmd_asru_link_t *alp, void *arg) hrtime_t hrt; int ps; int err; + fmd_asru_rep_arg_t fara; + + /* + * Checking for aged resources only happens on the diagnosing side + * not on a proxy. 
+ */ + if (alp->al_flags & FMD_ASRU_PROXY) + return; - ps = fmd_asru_replacement_state(alp->al_event); + ps = fmd_asru_replacement_state(alp->al_event, HC_ONLY_FALSE); if (ps == FMD_OBJ_STATE_REPLACED) { - fmd_asru_replaced(alp, &err); + fara.fara_reason = FMD_ASRU_REPLACED; + fara.fara_bywhat = FARA_ALL; + fara.fara_rval = &err; + fmd_asru_repaired(alp, &fara); } else if (ps == FMD_OBJ_STATE_NOT_PRESENT) { fmd_time_gettimeofday(&tv); lp = fmd_log_open(alp->al_asru->asru_root, alp->al_uuid, FMD_LOG_ASRU); hrt = (hrtime_t)(tv.tv_sec - lp->log_stat.st_mtime); fmd_log_rele(lp); - if (hrt * NANOSEC >= fmd.d_asrus->ah_lifetime) - fmd_asru_removed(alp); + if (hrt * NANOSEC >= fmd.d_asrus->ah_lifetime) { + fara.fara_reason = FMD_ASRU_REMOVED; + fara.fara_bywhat = FARA_ALL; + fara.fara_rval = &err; + fmd_asru_repaired(alp, &fara); + } } } @@ -1103,7 +1139,7 @@ fmd_asru_hash_delete_case(fmd_asru_hash_t *ahp, fmd_case_t *cp) */ (void) snprintf(path, sizeof (path), "%s/%s", ahp->ah_dirpath, alp->al_uuid); - if (unlink(path) != 0) + if (cip->ci_xprt == NULL && unlink(path) != 0) fmd_error(EFMD_ASRU_UNLINK, "failed to unlink asru %s", path); @@ -1142,171 +1178,237 @@ fmd_asru_hash_delete_case(fmd_asru_hash_t *ahp, fmd_case_t *cp) (void) pthread_rwlock_unlock(&ahp->ah_lock); } +typedef struct { + nvlist_t *farc_parent_fmri; + uint8_t farc_reason; +} fmd_asru_farc_t; + static void -fmd_asru_repair_containee(fmd_asru_link_t *alp, void *er) +fmd_asru_repair_containee(fmd_asru_link_t *alp, void *arg) { - if (er && (alp->al_asru->asru_flags & FMD_ASRU_INVISIBLE) && - alp->al_asru_fmri && fmd_fmri_contains(er, - alp->al_asru_fmri) > 0 && fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, - FMD_ASRU_REPAIRED)) - fmd_case_update(alp->al_case); + fmd_asru_farc_t *farcp = (fmd_asru_farc_t *)arg; + + if ((alp->al_asru->asru_flags & FMD_ASRU_INVISIBLE) && + alp->al_asru_fmri && + fmd_fmri_contains(farcp->farc_parent_fmri, alp->al_asru_fmri) > 0) { + if (fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, + 
farcp->farc_reason)) { + if (alp->al_flags & FMD_ASRU_PROXY) + fmd_case_xprt_updated(alp->al_case); + else + fmd_case_update(alp->al_case); + } + } } -void -fmd_asru_repaired(fmd_asru_link_t *alp, void *er) +static void +fmd_asru_do_repair_containees(fmd_asru_link_t *alp, uint8_t reason) { int flags; - int rval; - - /* - * repair this asru cache entry - */ - rval = fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, FMD_ASRU_REPAIRED); /* - * now check if all entries associated with this asru are repaired and - * if so repair containees + * Check if all entries associated with this asru are acquitted and + * if so acquit containees. Don't try to repair containees on proxy + * side unless we have local asru. */ - (void) pthread_mutex_lock(&alp->al_asru->asru_lock); - flags = alp->al_asru->asru_flags; - (void) pthread_mutex_unlock(&alp->al_asru->asru_lock); - if (!(flags & (FMD_ASRU_FAULTY | FMD_ASRU_INVISIBLE))) - fmd_asru_al_hash_apply(fmd.d_asrus, fmd_asru_repair_containee, - alp->al_asru_fmri); - - /* - * if called from fmd_adm_repair() and we really did clear the bit then - * we need to do a case update to see if the associated case can be - * repaired. No need to do this if called from fmd_case_repair() (ie - * when er is NULL) as the case will be explicitly repaired anyway. 
- */ - if (er) { - *(int *)er = 0; - if (rval) - fmd_case_update(alp->al_case); + if (alp->al_asru_fmri != NULL && (!(alp->al_flags & FMD_ASRU_PROXY) || + (alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU))) { + (void) pthread_mutex_lock(&alp->al_asru->asru_lock); + flags = alp->al_asru->asru_flags; + (void) pthread_mutex_unlock(&alp->al_asru->asru_lock); + if (!(flags & (FMD_ASRU_FAULTY | FMD_ASRU_INVISIBLE))) { + fmd_asru_farc_t farc; + + farc.farc_parent_fmri = alp->al_asru_fmri; + farc.farc_reason = reason; + fmd_asru_al_hash_apply(fmd.d_asrus, + fmd_asru_repair_containee, &farc); + } } } -static void -fmd_asru_acquit_containee(fmd_asru_link_t *alp, void *er) -{ - if (er && (alp->al_asru->asru_flags & FMD_ASRU_INVISIBLE) && - alp->al_asru_fmri && fmd_fmri_contains(er, - alp->al_asru_fmri) > 0 && fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, - FMD_ASRU_ACQUITTED)) - fmd_case_update(alp->al_case); -} - void -fmd_asru_acquit(fmd_asru_link_t *alp, void *er) +fmd_asru_repaired(fmd_asru_link_t *alp, void *arg) { - int flags; - int rval; + int cleared; + fmd_asru_rep_arg_t *farap = (fmd_asru_rep_arg_t *)arg; /* - * acquit this asru cache entry + * don't allow remote repair over readonly transport */ - rval = fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, FMD_ASRU_ACQUITTED); + if (alp->al_flags & FMD_ASRU_PROXY_RDONLY) + return; /* - * now check if all entries associated with this asru are acquitted and - * if so acquit containees + * don't allow repair etc by asru on proxy unless asru is local */ - (void) pthread_mutex_lock(&alp->al_asru->asru_lock); - flags = alp->al_asru->asru_flags; - (void) pthread_mutex_unlock(&alp->al_asru->asru_lock); - if (!(flags & (FMD_ASRU_FAULTY | FMD_ASRU_INVISIBLE))) - fmd_asru_al_hash_apply(fmd.d_asrus, fmd_asru_acquit_containee, - alp->al_asru_fmri); + if (farap->fara_bywhat == FARA_BY_ASRU && + (alp->al_flags & FMD_ASRU_PROXY) && + !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) + return; + /* + * For acquit, need to check both name and uuid if specified + 
*/ + if (farap->fara_reason == FMD_ASRU_ACQUITTED && + farap->fara_rval != NULL && strcmp(farap->fara_uuid, "") != 0 && + strcmp(farap->fara_uuid, alp->al_case_uuid) != 0) + return; /* - * if called from fmd_adm_acquit() and we really did clear the bit then - * we need to do a case update to see if the associated case can be - * repaired. No need to do this if called from fmd_case_acquit() (ie - * when er is NULL) as the case will be explicitly repaired anyway. + * For replaced, verify it has been replaced if we have serial number */ - if (er) { - *(int *)er = 0; - if (rval) - fmd_case_update(alp->al_case); + if (farap->fara_reason == FMD_ASRU_REPLACED && + !(alp->al_flags & FMD_ASRU_PROXY_EXTERNAL) && + fmd_asru_replacement_state(alp->al_event, + (alp->al_flags & FMD_ASRU_PROXY) ? HC_ONLY_TRUE : HC_ONLY_FALSE) == + FMD_OBJ_STATE_STILL_PRESENT) { + return; } -} -static void -fmd_asru_replaced_containee(fmd_asru_link_t *alp, void *er) -{ - if (er && (alp->al_asru->asru_flags & FMD_ASRU_INVISIBLE) && - alp->al_asru_fmri && fmd_fmri_contains(er, - alp->al_asru_fmri) > 0 && fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, - FMD_ASRU_REPLACED)) - fmd_case_update(alp->al_case); + cleared = fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, farap->fara_reason); + fmd_asru_do_repair_containees(alp, farap->fara_reason); + + /* + * if called from fmd_adm_*() and we really did clear the bit then + * we need to do a case update to see if the associated case can be + * repaired. No need to do this if called from fmd_case_*() (ie + * when arg is NULL) as the case will be explicitly repaired anyway. + */ + if (farap->fara_rval) { + *farap->fara_rval = 0; + if (cleared) { + if (alp->al_flags & FMD_ASRU_PROXY) + fmd_case_xprt_updated(alp->al_case); + else + fmd_case_update(alp->al_case); + } + } } +/* + * This is only called for proxied faults. Set various flags so we can + * find the nature of the transport from the resource cache code. 
+ */ +/*ARGSUSED*/ void -fmd_asru_replaced(fmd_asru_link_t *alp, void *er) +fmd_asru_set_on_proxy(fmd_asru_link_t *alp, void *arg) { - int flags; - int rval; - int ps; + fmd_asru_set_on_proxy_t *entryp = (fmd_asru_set_on_proxy_t *)arg; - ps = fmd_asru_replacement_state(alp->al_event); - if (ps == FMD_OBJ_STATE_STILL_PRESENT) + if (*entryp->fasp_countp >= entryp->fasp_maxcount) return; /* - * mark this cache entry as replaced + * Note that this is a proxy fault and save whetehr transport is + * RDONLY or EXTERNAL. */ - rval = fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, FMD_ASRU_REPLACED); + alp->al_flags |= FMD_ASRU_PROXY; + alp->al_asru->asru_flags |= FMD_ASRU_PROXY; + + if (entryp->fasp_proxy_external) { + alp->al_flags |= FMD_ASRU_PROXY_EXTERNAL; + alp->al_asru->asru_flags |= FMD_ASRU_PROXY_EXTERNAL; + } + + if (entryp->fasp_proxy_rdonly) + alp->al_flags |= FMD_ASRU_PROXY_RDONLY; /* - * now check if all entries associated with this asru are replaced and - * if so replace containees + * Save whether asru is accessible in local domain */ - (void) pthread_mutex_lock(&alp->al_asru->asru_lock); - flags = alp->al_asru->asru_flags; - (void) pthread_mutex_unlock(&alp->al_asru->asru_lock); - if (!(flags & (FMD_ASRU_FAULTY | FMD_ASRU_INVISIBLE))) - fmd_asru_al_hash_apply(fmd.d_asrus, fmd_asru_replaced_containee, - alp->al_asru_fmri); - - *(int *)er = 0; - if (rval) - fmd_case_update(alp->al_case); + if (entryp->fasp_proxy_asru[*entryp->fasp_countp]) { + alp->al_flags |= FMD_ASRU_PROXY_WITH_ASRU; + alp->al_asru->asru_flags |= FMD_ASRU_PROXY_WITH_ASRU; + } + (*entryp->fasp_countp)++; } -static void -fmd_asru_removed_containee(fmd_asru_link_t *alp, void *er) +/*ARGSUSED*/ +void +fmd_asru_update_containees(fmd_asru_link_t *alp, void *arg) { - if (er && (alp->al_asru->asru_flags & FMD_ASRU_INVISIBLE) && - alp->al_asru_fmri && fmd_fmri_contains(er, - alp->al_asru_fmri) > 0 && fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, - 0)) - fmd_case_update(alp->al_case); + 
fmd_asru_do_repair_containees(alp, alp->al_reason); } +/* + * This function is used for fault proxying. It updates the resource status in + * the resource cache based on information that has come from the other side of + * the transport. This can be called on either the proxy side or the + * diagnosing side. + */ void -fmd_asru_removed(fmd_asru_link_t *alp) +fmd_asru_update_status(fmd_asru_link_t *alp, void *arg) { - int flags; - int rval; + fmd_asru_update_status_t *entryp = (fmd_asru_update_status_t *)arg; + uint8_t status; + + if (*entryp->faus_countp >= entryp->faus_maxcount) + return; + + status = entryp->faus_ba[*entryp->faus_countp]; /* - * mark this cache entry as replacded + * For proxy, if there is no asru on the proxy side, but there is on + * the diag side, then take the diag side asru status. + * For diag, if there is an asru on the proxy side, then take the proxy + * side asru status. */ - rval = fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, 0); + if (entryp->faus_is_proxy ? + (entryp->faus_diag_asru[*entryp->faus_countp] && + !entryp->faus_proxy_asru[*entryp->faus_countp]) : + entryp->faus_proxy_asru[*entryp->faus_countp]) { + if (status & FM_SUSPECT_DEGRADED) + alp->al_flags |= FMD_ASRU_DEGRADED; + else + alp->al_flags &= ~FMD_ASRU_DEGRADED; + if (status & FM_SUSPECT_UNUSABLE) + (void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE); + else + (void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0); + } /* - * now check if all entries associated with this asru are removed and - * if so replace containees + * Update the faulty status too. 
*/ - (void) pthread_mutex_lock(&alp->al_asru->asru_lock); - flags = alp->al_asru->asru_flags; - (void) pthread_mutex_unlock(&alp->al_asru->asru_lock); - if (!(flags & (FMD_ASRU_FAULTY | FMD_ASRU_INVISIBLE))) - fmd_asru_al_hash_apply(fmd.d_asrus, fmd_asru_removed_containee, - alp->al_asru_fmri); - if (rval) - fmd_case_update(alp->al_case); + if (!(status & FM_SUSPECT_FAULTY)) + (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, + (status & FM_SUSPECT_REPAIRED) ? FMD_ASRU_REPAIRED : + (status & FM_SUSPECT_REPLACED) ? FMD_ASRU_REPLACED : + (status & FM_SUSPECT_ACQUITTED) ? FMD_ASRU_ACQUITTED : + FMD_ASRU_REMOVED); + else if (entryp->faus_is_proxy) + (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); + + /* + * for proxy only, update the present status too. + */ + if (entryp->faus_is_proxy) { + if (!(status & FM_SUSPECT_NOT_PRESENT)) { + alp->al_flags |= FMD_ASRU_PRESENT; + alp->al_asru->asru_flags |= FMD_ASRU_PRESENT; + } else { + alp->al_flags &= ~FMD_ASRU_PRESENT; + alp->al_asru->asru_flags &= ~FMD_ASRU_PRESENT; + } + } + (*entryp->faus_countp)++; +} + +/* + * This function is called on the diagnosing side when fault proxying is + * in use and the proxy has sent a uuclose. It updates the status of the + * resource cache entries. 
+ */ +void +fmd_asru_close_status(fmd_asru_link_t *alp, void *arg) +{ + fmd_asru_close_status_t *entryp = (fmd_asru_close_status_t *)arg; + + if (*entryp->facs_countp >= entryp->facs_maxcount) + return; + alp->al_flags &= ~FMD_ASRU_DEGRADED; + (void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE); + (*entryp->facs_countp)++; } static void @@ -1330,6 +1432,12 @@ fmd_asru_logevent(fmd_asru_link_t *alp) cip = (fmd_case_impl_t *)alp->al_case; ASSERT(cip != NULL); + /* + * Don't log to disk on proxy side + */ + if (cip->ci_xprt != NULL) + return; + if ((lp = alp->al_log) == NULL) lp = fmd_log_open(ap->asru_root, alp->al_uuid, FMD_LOG_ASRU); @@ -1338,7 +1446,8 @@ fmd_asru_logevent(fmd_asru_link_t *alp) nvl = fmd_protocol_rsrc_asru(_fmd_asru_events[faulty | (unusable << 1)], alp->al_asru_fmri, cip->ci_uuid, cip->ci_code, faulty, unusable, - message, alp->al_event, &cip->ci_tv, repaired, replaced, acquitted); + message, alp->al_event, &cip->ci_tv, repaired, replaced, acquitted, + cip->ci_diag_de == NULL ? cip->ci_mod->mod_fmri : cip->ci_diag_de); (void) nvlist_lookup_string(nvl, FM_CLASS, &class); e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); @@ -1446,45 +1555,65 @@ fmd_asru_clrflags(fmd_asru_link_t *alp, uint_t sflag, uint8_t reason) int fmd_asru_al_getstate(fmd_asru_link_t *alp) { - int us, st; + int us, st = (alp->al_flags & (FMD_ASRU_FAULTY | FMD_ASRU_UNUSABLE)); nvlist_t *asru; - int ps; + int ps = FMD_OBJ_STATE_UNKNOWN; - ps = fmd_asru_replacement_state(alp->al_event); - if (ps == FMD_OBJ_STATE_NOT_PRESENT) - return ((alp->al_flags & FMD_ASRU_FAULTY) | FMD_ASRU_UNUSABLE); - if (ps == FMD_OBJ_STATE_REPLACED) { - if (alp->al_reason < FMD_ASRU_REPLACED) - alp->al_reason = FMD_ASRU_REPLACED; - return ((alp->al_flags & FMD_ASRU_FAULTY) | FMD_ASRU_UNUSABLE); + /* + * For fault proxying with an EXTERNAL transport, believe the presence + * state as sent by the diagnosing side. Otherwise find the presence + * state here. 
Note that if fault proxying with an INTERNAL transport + * we can only trust the presence state where we are using hc-scheme + * fmris which should be consistant across domains in the same system - + * other schemes can refer to different devices in different domains. + */ + if (!(alp->al_flags & FMD_ASRU_PROXY_EXTERNAL)) { + ps = fmd_asru_replacement_state(alp->al_event, (alp->al_flags & + FMD_ASRU_PROXY)? HC_ONLY_TRUE : HC_ONLY_FALSE); + if (ps == FMD_OBJ_STATE_NOT_PRESENT) + return (st | FMD_ASRU_UNUSABLE); + if (ps == FMD_OBJ_STATE_REPLACED) { + if (alp->al_reason < FMD_ASRU_REPLACED) + alp->al_reason = FMD_ASRU_REPLACED; + return (st | FMD_ASRU_UNUSABLE); + } } + if (ps == FMD_OBJ_STATE_UNKNOWN && (alp->al_flags & FMD_ASRU_PROXY)) + st |= (alp->al_flags & (FMD_ASRU_DEGRADED | FMD_ASRU_PRESENT)); + else + st |= (alp->al_flags & (FMD_ASRU_DEGRADED)) | FMD_ASRU_PRESENT; - st = (alp->al_flags & FMD_ASRU_STATE) | FMD_ASRU_PRESENT; - if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) == 0) { + /* + * For fault proxying, unless we have a local ASRU, then believe the + * service state sent by the diagnosing side. Otherwise find the service + * state here. Try fmd_fmri_service_state() first, but if that's not + * supported by the scheme then fall back to fmd_fmri_unusable(). 
+ */ + if ((!(alp->al_flags & FMD_ASRU_PROXY) || + (alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) && + nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) == 0) { us = fmd_fmri_service_state(asru); if (us == -1 || us == FMD_SERVICE_STATE_UNKNOWN) { /* not supported by scheme - try fmd_fmri_unusable */ us = fmd_fmri_unusable(asru); - } else if (us == FMD_SERVICE_STATE_UNUSABLE) { - st |= FMD_ASRU_UNUSABLE; - return (st); - } else if (us == FMD_SERVICE_STATE_OK) { - st &= ~FMD_ASRU_UNUSABLE; - return (st); - } else if (us == FMD_SERVICE_STATE_ISOLATE_PENDING) { - st &= ~FMD_ASRU_UNUSABLE; - return (st); - } else if (us == FMD_SERVICE_STATE_DEGRADED) { - st &= ~FMD_ASRU_UNUSABLE; - st |= FMD_ASRU_DEGRADED; - return (st); + if (us > 0) + st |= FMD_ASRU_UNUSABLE; + else if (us == 0) + st &= ~FMD_ASRU_UNUSABLE; + } else { + if (us == FMD_SERVICE_STATE_UNUSABLE) { + st &= ~FMD_ASRU_DEGRADED; + st |= FMD_ASRU_UNUSABLE; + } else if (us == FMD_SERVICE_STATE_OK) { + st &= ~(FMD_ASRU_DEGRADED | FMD_ASRU_UNUSABLE); + } else if (us == FMD_SERVICE_STATE_ISOLATE_PENDING) { + st &= ~(FMD_ASRU_DEGRADED | FMD_ASRU_UNUSABLE); + } else if (us == FMD_SERVICE_STATE_DEGRADED) { + st &= ~FMD_ASRU_UNUSABLE; + st |= FMD_ASRU_DEGRADED; + } } - } else - us = (alp->al_flags & FMD_ASRU_UNUSABLE); - if (us > 0) - st |= FMD_ASRU_UNUSABLE; - else if (us == 0) - st &= ~FMD_ASRU_UNUSABLE; + } return (st); } @@ -1499,20 +1628,43 @@ fmd_asru_al_getstate(fmd_asru_link_t *alp) int fmd_asru_getstate(fmd_asru_t *ap) { - int us, st; - - if (!(ap->asru_flags & FMD_ASRU_INTERNAL) && - (fmd_asru_fake_not_present >= FMD_OBJ_STATE_REPLACED || - fmd_fmri_present(ap->asru_fmri) <= 0)) - return (0); /* do not report non-fmd non-present resources */ - - us = fmd_fmri_unusable(ap->asru_fmri); - st = ap->asru_flags & FMD_ASRU_STATE; + int us, st, p = -1; + char *s; - if (us > 0) - st |= FMD_ASRU_UNUSABLE; - else if (us == 0) - st &= ~FMD_ASRU_UNUSABLE; + /* do not report non-fmd non-present resources */ + if 
(!(ap->asru_flags & FMD_ASRU_INTERNAL)) { + /* + * As with fmd_asru_al_getstate(), we can only trust the + * local presence state on a proxy if the transport is + * internal and the scheme is hc. Otherwise we believe the + * state as sent by the diagnosing side. + */ + if (!(ap->asru_flags & FMD_ASRU_PROXY) || + (!(ap->asru_flags & FMD_ASRU_PROXY_EXTERNAL) && + (nvlist_lookup_string(ap->asru_fmri, FM_FMRI_SCHEME, + &s) == 0 && strcmp(s, FM_FMRI_SCHEME_HC) == 0))) { + if (fmd_asru_fake_not_present >= + FMD_OBJ_STATE_REPLACED) + return (0); + p = fmd_fmri_present(ap->asru_fmri); + } + if (p == 0 || (p < 0 && !(ap->asru_flags & FMD_ASRU_PROXY) || + !(ap->asru_flags & FMD_ASRU_PRESENT))) + return (0); + } + /* + * As with fmd_asru_al_getstate(), we can only trust the local unusable + * state on a proxy if there is a local ASRU. + */ + st = ap->asru_flags & (FMD_ASRU_FAULTY | FMD_ASRU_UNUSABLE); + if (!(ap->asru_flags & FMD_ASRU_PROXY) || + (ap->asru_flags & FMD_ASRU_PROXY_WITH_ASRU)) { + us = fmd_fmri_unusable(ap->asru_fmri); + if (us > 0) + st |= FMD_ASRU_UNUSABLE; + else if (us == 0) + st &= ~FMD_ASRU_UNUSABLE; + } return (st); } diff --git a/usr/src/cmd/fm/fmd/common/fmd_asru.h b/usr/src/cmd/fm/fmd/common/fmd_asru.h index f0a5738f3e..c350c801c0 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_asru.h +++ b/usr/src/cmd/fm/fmd/common/fmd_asru.h @@ -20,15 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _FMD_ASRU_H #define _FMD_ASRU_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <pthread.h> @@ -100,11 +98,16 @@ typedef struct fmd_asru_link { #define FMD_ASRU_RECREATED 0x20 /* asru recreated by cache replay */ #define FMD_ASRU_PRESENT 0x40 /* asru present at last R$ update */ #define FMD_ASRU_DEGRADED 0x80 /* asru service is degraded */ +#define FMD_ASRU_PROXY 0x100 /* asru on proxy */ +#define FMD_ASRU_PROXY_WITH_ASRU 0x200 /* asru accessible locally on proxy */ +#define FMD_ASRU_PROXY_EXTERNAL 0x400 /* proxy over external transport */ +#define FMD_ASRU_PROXY_RDONLY 0x800 /* proxy over readonly transport */ /* * Note the following are defined in order of increasing precedence and * this should not be changed */ +#define FMD_ASRU_REMOVED 0 /* asru removed */ #define FMD_ASRU_ACQUITTED 1 /* asru acquitted */ #define FMD_ASRU_REPAIRED 2 /* asru repaired */ #define FMD_ASRU_REPLACED 3 /* asru replaced */ @@ -162,10 +165,56 @@ extern void fmd_asru_hash_release(fmd_asru_hash_t *, fmd_asru_t *); extern void fmd_asru_hash_delete_case(fmd_asru_hash_t *, fmd_case_t *); extern void fmd_asru_clear_aged_rsrcs(); + +/* + * flags used in fara_bywhat field in fmd_asru_rep_arg_t + */ +#define FARA_ALL 0 +#define FARA_BY_CASE 1 +#define FARA_BY_ASRU 2 +#define FARA_BY_FRU 3 +#define FARA_BY_RSRC 4 +#define FARA_BY_LABEL 5 + +/* + * The following structures are used to pass arguments to the corresponding + * function when walking the resource cache by case etc. 
+ */ +typedef struct { + uint8_t fara_reason; /* repaired, acquit, replaced, removed */ + uint8_t fara_bywhat; /* whether doing a walk by case, asru, etc */ + int *fara_rval; /* for return success or failure */ + char *fara_uuid; /* uuid can be passed in for comparison */ +} fmd_asru_rep_arg_t; extern void fmd_asru_repaired(fmd_asru_link_t *, void *); -extern void fmd_asru_acquit(fmd_asru_link_t *, void *); -extern void fmd_asru_replaced(fmd_asru_link_t *, void *); -extern void fmd_asru_removed(fmd_asru_link_t *); + +typedef struct { + int *faus_countp; + int faus_maxcount; + uint8_t *faus_ba; /* received status for each suspect */ + uint8_t *faus_proxy_asru; /* asru on proxy for each suspect? */ + uint8_t *faus_diag_asru; /* asru on diag for each suspect? */ + boolean_t faus_is_proxy; /* are we on the proxy side? */ +} fmd_asru_update_status_t; +extern void fmd_asru_update_status(fmd_asru_link_t *alp, void *arg); + +typedef struct { + int *fasp_countp; + int fasp_maxcount; + uint8_t *fasp_proxy_asru; /* asru on proxy for each suspect? */ + int fasp_proxy_external; /* is this an external transport? */ + int fasp_proxy_rdonly; /* is this a rdonly transport? */ +} fmd_asru_set_on_proxy_t; +extern void fmd_asru_set_on_proxy(fmd_asru_link_t *alp, void *arg); + +extern void fmd_asru_update_containees(fmd_asru_link_t *alp, void *arg); + +typedef struct { + int *facs_countp; + int facs_maxcount; +} fmd_asru_close_status_t; +extern void fmd_asru_close_status(fmd_asru_link_t *alp, void *arg); + extern int fmd_asru_setflags(fmd_asru_link_t *, uint_t); extern int fmd_asru_clrflags(fmd_asru_link_t *, uint_t, uint8_t); extern int fmd_asru_al_getstate(fmd_asru_link_t *); diff --git a/usr/src/cmd/fm/fmd/common/fmd_case.c b/usr/src/cmd/fm/fmd/common/fmd_case.c index d45475e599..8a03b670eb 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_case.c +++ b/usr/src/cmd/fm/fmd/common/fmd_case.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -373,9 +373,20 @@ static void fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg) { int *rvalp = (int *)arg; - int state = fmd_asru_al_getstate(alp); + int state; nvlist_t *asru; + /* + * if this a proxy case and this suspect doesn't have an local asru + * then state is unknown so we must assume it may still be unusable. + */ + if ((alp->al_flags & FMD_ASRU_PROXY) && + !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) { + *rvalp |= B_TRUE; + return; + } + + state = fmd_asru_al_getstate(alp); if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0) return; *rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT)); @@ -430,8 +441,13 @@ fmd_case_mkevent(fmd_case_t *cp, const char *class) if (msg == B_FALSE) cip->ci_flags |= FMD_CF_INVISIBLE; - nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid, - code, count, nva, ba, msg, &cip->ci_tv); + /* + * Use the ci_diag_de if one has been saved (eg for an injected fault). + * Otherwise use the authority for the current module. + */ + nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ? 
+ cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count, + nva, ba, msg, &cip->ci_tv); (void) pthread_mutex_unlock(&cip->ci_lock); return (nvl); @@ -599,7 +615,10 @@ fmd_case_convict(fmd_case_t *cp) fmd_asru_link_t *alp; (void) pthread_mutex_lock(&cip->ci_lock); - (void) fmd_case_mkcode(cp); + if (cip->ci_code == NULL) + (void) fmd_case_mkcode(cp); + else if (cip->ci_precanned) + fmd_case_code_hash_insert(fmd.d_cases, cip); if (fmd_case_check_for_dups(cp) == 1) { (void) pthread_mutex_unlock(&cip->ci_lock); return (1); @@ -615,6 +634,8 @@ fmd_case_convict(fmd_case_t *cp) "%s: %s\n", cip->ci_uuid, fmd_strerror(errno)); continue; } + alp->al_flags |= FMD_ASRU_PRESENT; + alp->al_asru->asru_flags |= FMD_ASRU_PRESENT; (void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0); (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); } @@ -641,7 +662,8 @@ fmd_case_publish(fmd_case_t *cp, uint_t state) /* * If we already have a code, then case is already solved. */ - if (cip->ci_code != NULL) { + if (cip->ci_precanned == 0 && cip->ci_xprt == NULL && + cip->ci_code != NULL) { (void) pthread_mutex_unlock(&cip->ci_lock); break; } @@ -657,6 +679,25 @@ fmd_case_publish(fmd_case_t *cp, uint_t state) fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0); break; } + if (cip->ci_xprt != NULL) { + /* + * For proxy, save some information about the transport + * in the resource cache. 
+ */ + int count = 0; + fmd_asru_set_on_proxy_t fasp; + fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt; + + fasp.fasp_countp = &count; + fasp.fasp_maxcount = cip->ci_nsuspects; + fasp.fasp_proxy_asru = cip->ci_proxy_asru; + fasp.fasp_proxy_external = xip->xi_flags & + FMD_XPRT_EXTERNAL; + fasp.fasp_proxy_rdonly = ((xip->xi_flags & + FMD_XPRT_RDWR) == FMD_XPRT_RDONLY); + fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, + fmd_asru_set_on_proxy, &fasp); + } nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS); (void) nvlist_lookup_string(nvl, FM_CLASS, &class); @@ -867,6 +908,15 @@ fmd_case_destroy_suspects(fmd_case_impl_t *cip) ASSERT(MUTEX_HELD(&cip->ci_lock)); + if (cip->ci_proxy_asru) + fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) * + cip->ci_nsuspects); + if (cip->ci_diag_de) + nvlist_free(cip->ci_diag_de); + if (cip->ci_diag_asru) + fmd_free(cip->ci_diag_asru, sizeof (uint8_t) * + cip->ci_nsuspects); + for (cis = cip->ci_suspects; cis != NULL; cis = ncis) { ncis = cis->cis_next; nvlist_free(cis->cis_nvl); @@ -921,12 +971,28 @@ fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, */ if (mp == fmd.d_rmod) { /* + * In case the case has already been created from + * a checkpoint file we need to set up code now. + */ + if (cip->ci_state < FMD_CASE_CLOSED) { + if (code != NULL && cip->ci_code == NULL) { + cip->ci_code = fmd_strdup(code, + FMD_SLEEP); + cip->ci_codelen = cip->ci_code ? + strlen(cip->ci_code) + 1 : 0; + fmd_case_code_hash_insert(fmd.d_cases, + cip); + } + } + + /* * When recreating an orphan case, state passed in may * either be CLOSED (faulty) or REPAIRED (!faulty). If * any suspects are still CLOSED (faulty) then the * overall state needs to be CLOSED. 
*/ - if (state == FMD_CASE_CLOSED) + if (cip->ci_state == FMD_CASE_REPAIRED && + state == FMD_CASE_CLOSED) cip->ci_state = FMD_CASE_CLOSED; (void) pthread_mutex_unlock(&cip->ci_lock); fmd_case_rele((fmd_case_t *)cip); @@ -1202,7 +1268,8 @@ fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl) cip->ci_nsuspects++; (void) pthread_mutex_unlock(&cip->ci_lock); - fmd_module_setcdirty(cip->ci_mod); + if (cip->ci_xprt == NULL) + fmd_module_setcdirty(cip->ci_mod); } void @@ -1213,9 +1280,6 @@ fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl) boolean_t b; (void) pthread_mutex_lock(&cip->ci_lock); - ASSERT(cip->ci_state == FMD_CASE_CLOSED || - cip->ci_state == FMD_CASE_REPAIRED); - ASSERT(cip->ci_mod == fmd.d_rmod); cis->cis_next = cip->ci_suspects; cis->cis_nvl = nvl; @@ -1270,7 +1334,7 @@ fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) (void) pthread_mutex_lock(&cip->ci_lock); if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED)) - flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED); + flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED); cip->ci_flags |= flags; @@ -1319,21 +1383,42 @@ fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) break; case FMD_CASE_REPAIRED: - ASSERT(fmd_case_orphaned(cp)); + ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp)); /* - * If all suspects are already either usable or not present then - * transition straight to RESOLVED state, publishing both the - * list.repaired and list.resolved. + * If we've been requested to transition straight on to the + * RESOLVED state (which can happen with fault proxying where a + * list.resolved or a uuresolved is received from the other + * side), or if all suspects are already either usable or not + * present then transition straight to RESOLVED state, + * publishing both the list.repaired and list.resolved. 
For a + * proxy, if we discover here that all suspects are already + * either usable or not present, notify the diag side instead + * using fmd_xprt_uuresolved(). */ - fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, - fmd_case_unusable_and_present, &any_unusable_and_present); - if (any_unusable_and_present) - break; + if (flags & FMD_CF_RESOLVED) { + if (cip->ci_xprt != NULL) { + fmd_list_delete(&cip->ci_mod->mod_cases, cip); + } else { + fmd_module_lock(cip->ci_mod); + fmd_list_delete(&cip->ci_mod->mod_cases, cip); + fmd_module_unlock(cip->ci_mod); + } + } else { + fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, + fmd_case_unusable_and_present, + &any_unusable_and_present); + if (any_unusable_and_present) + break; + if (cip->ci_xprt != NULL) { + fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid); + break; + } + fmd_module_lock(cip->ci_mod); + fmd_list_delete(&cip->ci_mod->mod_cases, cip); + fmd_module_unlock(cip->ci_mod); + } - fmd_module_lock(cip->ci_mod); - fmd_list_delete(&cip->ci_mod->mod_cases, cip); - fmd_module_unlock(cip->ci_mod); cip->ci_state = FMD_CASE_RESOLVED; (void) pthread_mutex_unlock(&cip->ci_lock); fmd_case_publish(cp, state); @@ -1346,6 +1431,17 @@ fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) break; case FMD_CASE_RESOLVED: + /* + * For a proxy, no need to check that all suspects are already + * either usable or not present - this request has come from + * the diagnosing side which makes the final decision on this. 
+ */ + if (cip->ci_xprt != NULL) { + fmd_list_delete(&cip->ci_mod->mod_cases, cip); + resolved = 1; + break; + } + ASSERT(fmd_case_orphaned(cp)); /* @@ -1473,6 +1569,100 @@ fmd_case_commit(fmd_case_t *cp) } /* + * On proxy side, send back repair/acquit/etc request to diagnosing side + */ +void +fmd_case_xprt_updated(fmd_case_t *cp) +{ + fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; + nvlist_t **nva; + uint8_t *ba; + int msg = B_TRUE; + int count = 0; + fmd_case_lst_t fcl; + + ASSERT(cip->ci_xprt != NULL); + (void) pthread_mutex_lock(&cip->ci_lock); + ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); + nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); + fcl.fcl_countp = &count; + fcl.fcl_maxcount = cip->ci_nsuspects; + fcl.fcl_msgp = &msg; + fcl.fcl_ba = ba; + fcl.fcl_nva = nva; + fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); + (void) pthread_mutex_unlock(&cip->ci_lock); + fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru, + count); +} + +/* + * fmd_case_update_status() can be called on either the proxy side when a + * list.suspect is received, or on the diagnosing side when an update request + * is received from the proxy. It updates the status in the resource cache. 
+ */ +void +fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup, + uint8_t *diag_asrup) +{ + fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; + int count = 0; + fmd_asru_update_status_t faus; + + /* + * update status of resource cache entries + */ + faus.faus_countp = &count; + faus.faus_maxcount = cip->ci_nsuspects; + faus.faus_ba = statusp; + faus.faus_proxy_asru = proxy_asrup; + faus.faus_diag_asru = diag_asrup; + faus.faus_is_proxy = (cip->ci_xprt != NULL); + (void) pthread_mutex_lock(&cip->ci_lock); + fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status, + &faus); + (void) pthread_mutex_unlock(&cip->ci_lock); +} + +/* + * Called on either the proxy side or the diag side when a repair has taken + * place on the other side but this side may know the asru "contains" + * relationships. + */ +void +fmd_case_update_containees(fmd_case_t *cp) +{ + fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; + + (void) pthread_mutex_lock(&cip->ci_lock); + fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, + fmd_asru_update_containees, NULL); + (void) pthread_mutex_unlock(&cip->ci_lock); +} + +/* + * fmd_case_close_status() is called on diagnosing side when proxy side + * has had a uuclose. It updates the status in the resource cache. + */ +void +fmd_case_close_status(fmd_case_t *cp) +{ + fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; + int count = 0; + fmd_asru_close_status_t facs; + + /* + * update status of resource cache entries + */ + facs.facs_countp = &count; + facs.facs_maxcount = cip->ci_nsuspects; + (void) pthread_mutex_lock(&cip->ci_lock); + fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status, + &facs); + (void) pthread_mutex_unlock(&cip->ci_lock); +} + +/* * Indicate that the case may need to change state because one or more of the * ASRUs named as a suspect has changed state. We examine all the suspects * and if none are still faulty, we initiate a case close transition. 
@@ -1487,7 +1677,7 @@ fmd_case_update(fmd_case_t *cp) (void) pthread_mutex_lock(&cip->ci_lock); cstate = cip->ci_state; - if (cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) { + if (cip->ci_state < FMD_CASE_SOLVED) { (void) pthread_mutex_unlock(&cip->ci_lock); return; /* update is not appropriate */ } @@ -1497,6 +1687,7 @@ fmd_case_update(fmd_case_t *cp) return; /* already repaired */ } + TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid)); fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); (void) pthread_mutex_unlock(&cip->ci_lock); @@ -1505,6 +1696,7 @@ fmd_case_update(fmd_case_t *cp) fmd_event_t *e; char *class; + TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid)); nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); (void) nvlist_lookup_string(nvl, FM_CLASS, &class); e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); @@ -1525,7 +1717,7 @@ fmd_case_update(fmd_case_t *cp) * Delete a closed case from the module's case list once the fmdo_close() entry * point has run to completion. If the case is owned by a transport module, * tell the transport to proxy a case close on the other end of the transport. - * If not, transition to the appropriate next state based on ci_flags. This + * Transition to the appropriate next state based on ci_flags. This * function represents the end of CLOSE_WAIT and transitions the case to either * CLOSED or REPAIRED or discards it entirely because it was never solved; * refer to the topmost block comment explaining the state machine for details. 
@@ -1537,6 +1729,7 @@ fmd_case_delete(fmd_case_t *cp) fmd_modstat_t *msp; size_t buftotal; + TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid)); ASSERT(fmd_module_locked(cip->ci_mod)); fmd_list_delete(&cip->ci_mod->mod_cases, cip); buftotal = fmd_buf_hash_destroy(&cip->ci_bufs); @@ -1560,11 +1753,11 @@ fmd_case_delete(fmd_case_t *cp) fmd_module_hold(cip->ci_mod); /* - * If the case is not proxied and it has been solved, then retain it + * If the case has been solved, then retain it * on the root module's case list at least until we're transitioned. * Otherwise free the case with our final fmd_case_rele() below. */ - if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) { + if (cip->ci_flags & FMD_CF_SOLVED) { fmd_module_lock(cip->ci_mod); fmd_list_append(&cip->ci_mod->mod_cases, cip); fmd_module_unlock(cip->ci_mod); @@ -1572,22 +1765,26 @@ fmd_case_delete(fmd_case_t *cp) } /* - * If a proxied case finishes CLOSE_WAIT, then it can be discarded - * rather than orphaned because by definition it can have no entries - * in the resource cache of the current fault manager. + * Transition onwards to REPAIRED or CLOSED as originally requested. + * Note that for proxy case if we're transitioning to CLOSED it means + * the case was isolated locally, so call fmd_xprt_uuclose() to notify + * the diagnosing side. No need to notify the diagnosing side if we are + * transitioning to REPAIRED as we only do this when requested to do + * so by the diagnosing side anyway. 
*/ - if (cip->ci_xprt != NULL) - fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); - else if (cip->ci_flags & FMD_CF_REPAIRED) + if (cip->ci_flags & FMD_CF_REPAIRED) fmd_case_transition(cp, FMD_CASE_REPAIRED, 0); - else if (cip->ci_flags & FMD_CF_ISOLATED) + else if (cip->ci_flags & FMD_CF_ISOLATED) { fmd_case_transition(cp, FMD_CASE_CLOSED, 0); + if (cip->ci_xprt != NULL) + fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); + } fmd_case_rele(cp); } void -fmd_case_discard(fmd_case_t *cp) +fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache) { fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; @@ -1597,6 +1794,11 @@ fmd_case_discard(fmd_case_t *cp) ASSERT(fmd_module_locked(cip->ci_mod)); fmd_list_delete(&cip->ci_mod->mod_cases, cip); + if (delete_from_asru_cache) { + (void) pthread_mutex_lock(&cip->ci_lock); + fmd_asru_hash_delete_case(fmd.d_asrus, cp); + (void) pthread_mutex_unlock(&cip->ci_lock); + } fmd_case_rele(cp); } @@ -1612,15 +1814,11 @@ fmd_case_repair(fmd_case_t *cp) { fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; uint_t cstate; + fmd_asru_rep_arg_t fara; (void) pthread_mutex_lock(&cip->ci_lock); cstate = cip->ci_state; - if (cip->ci_xprt != NULL) { - (void) pthread_mutex_unlock(&cip->ci_lock); - return (fmd_set_errno(EFMD_CASE_OWNER)); - } - if (cstate < FMD_CASE_SOLVED) { (void) pthread_mutex_unlock(&cip->ci_lock); return (fmd_set_errno(EFMD_CASE_STATE)); @@ -1631,9 +1829,23 @@ fmd_case_repair(fmd_case_t *cp) return (0); /* already repaired */ } - fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, NULL); + TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid)); + fara.fara_reason = FMD_ASRU_REPAIRED; + fara.fara_bywhat = FARA_BY_CASE; + fara.fara_rval = NULL; + fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara); (void) pthread_mutex_unlock(&cip->ci_lock); + /* + * if this is a proxied case, send the repair across the transport. 
+ * The remote side will then do the repair and send a list.repaired back + * again such that we can finally repair the case on this side. + */ + if (cip->ci_xprt != NULL) { + fmd_case_xprt_updated(cp); + return (0); + } + if (cstate == FMD_CASE_CLOSED) fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); else @@ -1647,15 +1859,11 @@ fmd_case_acquit(fmd_case_t *cp) { fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; uint_t cstate; + fmd_asru_rep_arg_t fara; (void) pthread_mutex_lock(&cip->ci_lock); cstate = cip->ci_state; - if (cip->ci_xprt != NULL) { - (void) pthread_mutex_unlock(&cip->ci_lock); - return (fmd_set_errno(EFMD_CASE_OWNER)); - } - if (cstate < FMD_CASE_SOLVED) { (void) pthread_mutex_unlock(&cip->ci_lock); return (fmd_set_errno(EFMD_CASE_STATE)); @@ -1666,9 +1874,23 @@ fmd_case_acquit(fmd_case_t *cp) return (0); /* already repaired */ } - fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_acquit, NULL); + TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid)); + fara.fara_reason = FMD_ASRU_ACQUITTED; + fara.fara_bywhat = FARA_BY_CASE; + fara.fara_rval = NULL; + fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara); (void) pthread_mutex_unlock(&cip->ci_lock); + /* + * if this is a proxied case, send the repair across the transport. + * The remote side will then do the repair and send a list.repaired back + * again such that we can finally repair the case on this side. 
+ */ + if (cip->ci_xprt != NULL) { + fmd_case_xprt_updated(cp); + return (0); + } + if (cstate == FMD_CASE_CLOSED) fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); else @@ -1722,6 +1944,25 @@ fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec) ((fmd_case_impl_t *)cp)->ci_tv_valid = 1; } +void +fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl) +{ + fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; + + if (cip->ci_diag_de) + nvlist_free(cip->ci_diag_de); + cip->ci_diag_de = nvl; +} + +void +fmd_case_setcode(fmd_case_t *cp, char *code) +{ + fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; + + cip->ci_code = fmd_strdup(code, FMD_SLEEP); + cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0; +} + /*ARGSUSED*/ void fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) @@ -1734,7 +1975,7 @@ fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) int any_unusable_and_present = 0; fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; - if (cip->ci_state < FMD_CASE_SOLVED) + if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL) return; fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); @@ -1755,18 +1996,24 @@ fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) fmd_module_unlock(cip->ci_mod); cip->ci_state = FMD_CASE_RESOLVED; + TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", + cip->ci_uuid)); nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); (void) nvlist_lookup_string(nvl, FM_CLASS, &class); e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); fmd_dispq_dispatch(fmd.d_disp, e, class); + TRACE((FMD_DBG_CASE, "replay sending list.resolved %s", + cip->ci_uuid)); fmd_case_publish(cp, FMD_CASE_RESOLVED); (void) pthread_mutex_lock(&cip->ci_lock); fmd_asru_hash_delete_case(fmd.d_asrus, cp); (void) pthread_mutex_unlock(&cip->ci_lock); fmd_case_rele(cp); } else { + TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", + cip->ci_uuid)); nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); (void) 
nvlist_lookup_string(nvl, FM_CLASS, &class); e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, @@ -1778,6 +2025,8 @@ fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) * if some but not all of the suspects are not faulty, replay * the list.updated. */ + TRACE((FMD_DBG_CASE, "replay sending list.updated %s", + cip->ci_uuid)); nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); (void) nvlist_lookup_string(nvl, FM_CLASS, &class); e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); diff --git a/usr/src/cmd/fm/fmd/common/fmd_case.h b/usr/src/cmd/fm/fmd/common/fmd_case.h index a635173795..354e3f35a1 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_case.h +++ b/usr/src/cmd/fm/fmd/common/fmd_case.h @@ -20,15 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _FMD_CASE_H #define _FMD_CASE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <pthread.h> #include <libnvpair.h> @@ -62,6 +60,10 @@ typedef struct fmd_case_impl { size_t ci_codelen; /* size of ci_code buffer in bytes */ struct fmd_module *ci_mod; /* module that owns this case */ fmd_xprt_t *ci_xprt; /* transport for this case (or NULL) */ + uint8_t ci_precanned; /* precanned code from injection */ + nvlist_t *ci_diag_de; /* diag side de fmri */ + uint8_t *ci_diag_asru; /* is asru valid on diag side */ + uint8_t *ci_proxy_asru; /* is asru valid on proxy side */ void *ci_data; /* data from fmd_case_setspecific() */ pthread_mutex_t ci_lock; /* lock for remainder of contents */ uint_t ci_refs; /* reference count */ @@ -91,10 +93,18 @@ typedef struct fmd_case_impl { #define FMD_CF_SOLVED 0x02 /* case has been solved */ #define FMD_CF_ISOLATED 0x04 /* case has been isolated */ #define FMD_CF_REPAIRED 0x08 /* case has been repaired */ -#define FMD_CF_REPAIRING 0x10 /* case repair in progress */ +#define FMD_CF_RESOLVED 0x10 /* case has been resolved */ #define 
FMD_CF_INVISIBLE 0x20 /* case should be invisible */ #define FMD_CF_DELETING 0x40 /* case is about to be deleted */ +/* + * ci_proxy_asru flags record if we created a new asru on the proxy side and + * if so whether it is derived from the received asru or received resource. + */ +#define FMD_PROXY_ASRU_NOT_NEEDED 0 +#define FMD_PROXY_ASRU_FROM_ASRU 1 +#define FMD_PROXY_ASRU_FROM_RSRC 2 + typedef struct fmd_case_hash { pthread_rwlock_t ch_lock; /* lock protecting case hash */ fmd_case_impl_t **ch_hash; /* hash bucket array for cases */ @@ -135,8 +145,15 @@ extern void fmd_case_clrdirty(fmd_case_t *); extern void fmd_case_commit(fmd_case_t *); extern void fmd_case_update(fmd_case_t *); extern void fmd_case_delete(fmd_case_t *); -extern void fmd_case_discard(fmd_case_t *); +extern void fmd_case_discard(fmd_case_t *, boolean_t); extern void fmd_case_settime(fmd_case_t *, time_t, suseconds_t); +extern void fmd_case_setcode(fmd_case_t *, char *); +extern void fmd_case_set_de_fmri(fmd_case_t *, nvlist_t *); +extern void fmd_case_update_status(fmd_case_t *, uint8_t *, uint8_t *, + uint8_t *); +extern void fmd_case_update_containees(fmd_case_t *); +extern void fmd_case_xprt_updated(fmd_case_t *); +extern void fmd_case_close_status(fmd_case_t *); extern int fmd_case_repair(fmd_case_t *); extern int fmd_case_acquit(fmd_case_t *); diff --git a/usr/src/cmd/fm/fmd/common/fmd_ckpt.c b/usr/src/cmd/fm/fmd/common/fmd_ckpt.c index 4c7a645be5..4bae10f7b4 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_ckpt.c +++ b/usr/src/cmd/fm/fmd/common/fmd_ckpt.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1057,11 +1057,19 @@ fmd_ckpt_restore_case(fmd_ckpt_t *ckp, fmd_module_t *mp, const fcf_sec_t *sp) /* * Once solved, treat suspects from resource cache as master copy. 
+ * + * If !fmd.d_running, this module must be a builtin, and so we don't + * want to restore suspects or call fmd_case_transition_update() at this + * stage. The suspects will be added later from the resource cache. + * Calling fmd_case_transition("SOLVED") is OK here as the state is + * already solved, so all it does is update the case flags. */ - if ((n = ((fmd_case_impl_t *)cp)->ci_nsuspects) == 0) + if (fmd.d_running && (n = ((fmd_case_impl_t *)cp)->ci_nsuspects) == 0) n = fmd_ckpt_restore_suspects(ckp, cp, fcfc->fcfc_suspects); - if (fcfc->fcfc_state == FCF_CASE_SOLVED) + if (!fmd.d_running) + fmd_case_transition(cp, FMD_CASE_SOLVED, FMD_CF_SOLVED); + else if (fcfc->fcfc_state == FCF_CASE_SOLVED) fmd_case_transition_update(cp, FMD_CASE_SOLVED, FMD_CF_SOLVED); else if (fcfc->fcfc_state == FCF_CASE_CLOSE_WAIT && n != 0) fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_SOLVED); diff --git a/usr/src/cmd/fm/fmd/common/fmd_dispq.c b/usr/src/cmd/fm/fmd/common/fmd_dispq.c index 8519a4475a..1aa0e0dbd7 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_dispq.c +++ b/usr/src/cmd/fm/fmd/common/fmd_dispq.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/fm/protocol.h> #include <sys/bitmap.h> @@ -330,6 +328,7 @@ fmd_dispq_dispatch_gid(fmd_dispq_t *dqp, if (FMD_EVENT_TYPE(ep) == FMD_EVT_PROTOCOL && (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 || strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 || + strcmp(class, FM_LIST_RESOLVED_CLASS) == 0 || strcmp(class, FM_LIST_UPDATED_CLASS) == 0) && nvlist_lookup_nvlist_array(FMD_EVENT_NVL(ep), FM_SUSPECT_FAULT_LIST, &nva, &nvc) == 0) { diff --git a/usr/src/cmd/fm/fmd/common/fmd_mdb.c b/usr/src/cmd/fm/fmd/common/fmd_mdb.c index ea5b553e10..a9e89d37bc 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_mdb.c +++ b/usr/src/cmd/fm/fmd/common/fmd_mdb.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/mdb_modapi.h> #include <limits.h> @@ -1053,6 +1051,63 @@ fmd_asru(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (DCMD_OK); } +static int +al_walk_init(mdb_walk_state_t *wsp) +{ + fmd_asru_hash_t ah; + fmd_t F; + + if (wsp->walk_addr == NULL && mdb_readvar(&F, "fmd") != sizeof (F)) { + mdb_warn("failed to read fmd meta-data"); + return (WALK_ERR); + } + + if (wsp->walk_addr == NULL) + wsp->walk_addr = (uintptr_t)F.d_asrus; + + if (mdb_vread(&ah, sizeof (ah), wsp->walk_addr) != sizeof (ah)) { + mdb_warn("failed to read asru_hash at %p", wsp->walk_addr); + return (WALK_ERR); + } + + return (hash_walk_init(wsp, (uintptr_t)ah.ah_rsrc_hash, ah.ah_hashlen, + "fmd_asru_link", sizeof (fmd_asru_link_t), OFFSETOF(fmd_asru_link_t, + al_rsrc_next))); +} + +static int +fmd_asru_link(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + char uuid[48], name[PATH_MAX]; + fmd_asru_link_t a; + + if (!(flags & DCMD_ADDRSPEC)) { + if (mdb_walk_dcmd("fmd_asru_link", "fmd_asru_link", argc, + argv) != 0) { + mdb_warn("failed to walk 
fmd_asru_link hash"); + return (DCMD_ERR); + } + return (DCMD_OK); + } + + if (mdb_vread(&a, sizeof (a), addr) != sizeof (a)) { + mdb_warn("failed to read fmd_asru_link at %p", addr); + return (DCMD_ERR); + } + + if (DCMD_HDRSPEC(flags)) + mdb_printf("%<u>%-8s %-36s %s%</u>\n", "ADDR", "UUID", "NAME"); + + if (mdb_readstr(uuid, sizeof (uuid), (uintptr_t)a.al_uuid) <= 0) + (void) mdb_snprintf(uuid, sizeof (uuid), "<%p>", a.al_uuid); + if (mdb_readstr(name, sizeof (name), (uintptr_t)a.al_rsrc_name) <= 0) + (void) mdb_snprintf(name, sizeof (name), "<%p>", + a.al_rsrc_name); + + mdb_printf("%-8p %-36s %s\n", addr, uuid, name); + return (DCMD_OK); +} + /*ARGSUSED*/ static int fcf_hdr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) @@ -1522,6 +1577,7 @@ static const mdb_dcmd_t dcmds[] = { { "fmd_buf", ":", "display buffer structure", fmd_buf }, { "fmd_serd", "[:]", "display serd engine structure", fmd_serd }, { "fmd_asru", "?", "display asru resource structure", fmd_asru }, + { "fmd_asru_link", "?", "display resource structure", fmd_asru_link }, { "fmd_timer", "?", "display pending timer(s)", fmd_timer }, { "fmd_xprt", "?[-lrsu]", "display event transport(s)", fmd_xprt }, { NULL } @@ -1546,6 +1602,8 @@ static const mdb_walker_t walkers[] = { serd_walk_init, hash_walk_step, hash_walk_fini }, { "fmd_asru", "walk asru resource hash", asru_walk_init, hash_walk_step, hash_walk_fini }, + { "fmd_asru_link", "walk resource hash", + al_walk_init, hash_walk_step, hash_walk_fini }, { "fmd_timerq", "walk timer queue", tmq_walk_init, tmq_walk_step, NULL }, { "fmd_xprt", "walk per-module list of transports", diff --git a/usr/src/cmd/fm/fmd/common/fmd_protocol.c b/usr/src/cmd/fm/fmd/common/fmd_protocol.c index 7064af0164..543a7786bb 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_protocol.c +++ b/usr/src/cmd/fm/fmd/common/fmd_protocol.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/fm/protocol.h> #include <strings.h> #include <alloca.h> @@ -183,7 +181,7 @@ fmd_protocol_rsrc_asru(const char *class, nvlist_t *fmri, const char *uuid, const char *code, boolean_t faulty, boolean_t unusable, boolean_t message, nvlist_t *event, struct timeval *tvp, boolean_t repaired, boolean_t replaced, - boolean_t acquitted) + boolean_t acquitted, nvlist_t *diag_de) { nvlist_t *nvl; int64_t tod[2]; @@ -214,6 +212,9 @@ fmd_protocol_rsrc_asru(const char *class, err |= nvlist_add_boolean_value(nvl, FM_SUSPECT_MESSAGE, message); err |= nvlist_add_int64_array(nvl, FM_SUSPECT_DIAG_TIME, tod, 2); + if (diag_de != NULL) + err |= nvlist_add_nvlist(nvl, FM_SUSPECT_DE, diag_de); + if (event != NULL) err |= nvlist_add_nvlist(nvl, FM_RSRC_ASRU_EVENT, event); @@ -335,3 +336,35 @@ fmd_protocol_xprt_uuclose(fmd_module_t *mp, const char *class, uint8_t version, return (nvl); } + +nvlist_t * +fmd_protocol_xprt_uuresolved(fmd_module_t *mp, const char *class, + uint8_t version, const char *uuid) +{ + nvlist_t *nvl = fmd_protocol_xprt_ctl(mp, class, version); + int err = nvlist_add_string(nvl, FM_RSRC_XPRT_UUID, uuid); + + if (err != 0) + fmd_panic("failed to populate nvlist: %s\n", fmd_strerror(err)); + + return (nvl); +} + +nvlist_t * +fmd_protocol_xprt_updated(fmd_module_t *mp, const char *class, uint8_t version, + const char *uuid, uint8_t *statusp, uint8_t *has_asrup, uint_t nelem) +{ + nvlist_t *nvl = fmd_protocol_xprt_ctl(mp, class, version); + int err = nvlist_add_string(nvl, FM_RSRC_XPRT_UUID, uuid); + + err |= nvlist_add_uint8_array(nvl, FM_RSRC_XPRT_FAULT_STATUS, statusp, + nelem); + if (has_asrup) + err |= nvlist_add_uint8_array(nvl, FM_RSRC_XPRT_FAULT_HAS_ASRU, + has_asrup, nelem); + + if (err != 0) + fmd_panic("failed to populate nvlist: %s\n", fmd_strerror(err)); + + return (nvl); +} diff --git a/usr/src/cmd/fm/fmd/common/fmd_protocol.h 
b/usr/src/cmd/fm/fmd/common/fmd_protocol.h index 68f2196b18..927a875ec3 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_protocol.h +++ b/usr/src/cmd/fm/fmd/common/fmd_protocol.h @@ -20,15 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _FMD_PROTOCOL_H #define _FMD_PROTOCOL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/fm/protocol.h> #include <libnvpair.h> #include <stdarg.h> @@ -76,7 +74,7 @@ extern nvlist_t *fmd_protocol_list(const char *, nvlist_t *, struct timeval *); extern nvlist_t *fmd_protocol_rsrc_asru(const char *, nvlist_t *, const char *, const char *, boolean_t, boolean_t, boolean_t, nvlist_t *, - struct timeval *m, boolean_t, boolean_t, boolean_t); + struct timeval *m, boolean_t, boolean_t, boolean_t, nvlist_t *); extern nvlist_t *fmd_protocol_fmderror(int, const char *, va_list); extern nvlist_t *fmd_protocol_moderror(struct fmd_module *, int, const char *); extern nvlist_t *fmd_protocol_xprt_ctl(struct fmd_module *, @@ -85,6 +83,10 @@ extern nvlist_t *fmd_protocol_xprt_sub(struct fmd_module *, const char *, uint8_t, const char *); extern nvlist_t *fmd_protocol_xprt_uuclose(struct fmd_module *, const char *, uint8_t, const char *); +extern nvlist_t *fmd_protocol_xprt_uuresolved(struct fmd_module *, + const char *, uint8_t, const char *); +extern nvlist_t *fmd_protocol_xprt_updated(struct fmd_module *, + const char *, uint8_t, const char *, uint8_t *, uint8_t *, uint_t); #ifdef __cplusplus } diff --git a/usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c b/usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c index 2987849868..07e43656b2 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c +++ b/usr/src/cmd/fm/fmd/common/fmd_rpc_adm.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <strings.h> #include <limits.h> #include <unistd.h> @@ -465,89 +463,69 @@ fmd_adm_rsrcinfo_1_svc(char *fmri, return (TRUE); } -bool_t -fmd_adm_rsrcflush_1_svc(char *name, int *rvp, struct svc_req *req) -{ - return (fmd_adm_rsrcrepaired_1_svc(name, rvp, req)); -} - -bool_t -fmd_adm_rsrcrepaired_1_svc(char *name, int *rvp, struct svc_req *req) +static void +fmd_adm_do_repair(char *name, struct svc_req *req, int *errp, uint8_t reason, + char *uuid) { - int err = FMD_ADM_ERR_RSRCNOTF; - if (fmd_rpc_deny(req)) - err = FMD_ADM_ERR_PERM; + *errp = FMD_ADM_ERR_PERM; else { + fmd_asru_rep_arg_t fara; + + fara.fara_reason = reason; + fara.fara_rval = errp; + fara.fara_uuid = uuid; + fara.fara_bywhat = FARA_BY_ASRU; fmd_asru_hash_apply_by_asru(fmd.d_asrus, name, - fmd_asru_repaired, &err); + fmd_asru_repaired, &fara); + fara.fara_bywhat = FARA_BY_LABEL; fmd_asru_hash_apply_by_label(fmd.d_asrus, name, - fmd_asru_repaired, &err); + fmd_asru_repaired, &fara); + fara.fara_bywhat = FARA_BY_FRU; fmd_asru_hash_apply_by_fru(fmd.d_asrus, name, - fmd_asru_repaired, &err); + fmd_asru_repaired, &fara); + fara.fara_bywhat = FARA_BY_RSRC; fmd_asru_hash_apply_by_rsrc(fmd.d_asrus, name, - fmd_asru_repaired, &err); + fmd_asru_repaired, &fara); } +} + +bool_t +fmd_adm_rsrcflush_1_svc(char *name, int *rvp, struct svc_req *req) +{ + int err = FMD_ADM_ERR_RSRCNOTF; + + fmd_adm_do_repair(name, req, &err, FMD_ASRU_REPAIRED, NULL); *rvp = err; return (TRUE); } bool_t -fmd_adm_rsrcreplaced_1_svc(char *name, int *rvp, struct svc_req *req) +fmd_adm_rsrcrepaired_1_svc(char *name, int *rvp, struct svc_req *req) { int err = FMD_ADM_ERR_RSRCNOTF; - if (fmd_rpc_deny(req)) - err = FMD_ADM_ERR_PERM; - else { - fmd_asru_hash_apply_by_asru(fmd.d_asrus, name, - fmd_asru_replaced, &err); - fmd_asru_hash_apply_by_label(fmd.d_asrus, name, - fmd_asru_replaced, &err); - fmd_asru_hash_apply_by_fru(fmd.d_asrus, name, - fmd_asru_replaced, &err); - 
fmd_asru_hash_apply_by_rsrc(fmd.d_asrus, name, - fmd_asru_replaced, &err); - } + fmd_adm_do_repair(name, req, &err, FMD_ASRU_REPAIRED, NULL); *rvp = err; return (TRUE); } -typedef struct { - int *errp; - char *uuid; -} fmd_adm_ra_t; - -void -fmd_asru_ra_cb(fmd_asru_link_t *alp, void *arg) +bool_t +fmd_adm_rsrcreplaced_1_svc(char *name, int *rvp, struct svc_req *req) { - fmd_adm_ra_t *farap = (fmd_adm_ra_t *)arg; + int err = FMD_ADM_ERR_RSRCNOTF; - if (strcmp(farap->uuid, "") == 0 || - strcmp(farap->uuid, alp->al_case_uuid) == 0) - fmd_asru_acquit(alp, farap->errp); + fmd_adm_do_repair(name, req, &err, FMD_ASRU_REPLACED, NULL); + *rvp = err; + return (TRUE); } bool_t fmd_adm_rsrcacquit_1_svc(char *name, char *uuid, int *rvp, struct svc_req *req) { int err = FMD_ADM_ERR_RSRCNOTF; - fmd_adm_ra_t fara; - if (fmd_rpc_deny(req)) - err = FMD_ADM_ERR_PERM; - else { - fara.errp = &err; - fara.uuid = uuid; - fmd_asru_hash_apply_by_asru(fmd.d_asrus, name, - fmd_asru_ra_cb, &fara); - fmd_asru_hash_apply_by_label(fmd.d_asrus, name, - fmd_asru_ra_cb, &fara); - fmd_asru_hash_apply_by_fru(fmd.d_asrus, name, - fmd_asru_ra_cb, &fara); - fmd_asru_hash_apply_by_rsrc(fmd.d_asrus, name, - fmd_asru_ra_cb, &fara); - } + fmd_adm_do_repair(name, req, &err, FMD_ASRU_ACQUITTED, uuid); *rvp = err; return (TRUE); } diff --git a/usr/src/cmd/fm/fmd/common/fmd_sysevent.c b/usr/src/cmd/fm/fmd/common/fmd_sysevent.c index 683ef1341a..658ca43a1f 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_sysevent.c +++ b/usr/src/cmd/fm/fmd/common/fmd_sysevent.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/sysevent/eventdefs.h> #include <sys/sysevent.h> #include <sys/sysevent_impl.h> @@ -464,7 +462,8 @@ sysev_init(fmd_hdl_t *hdl) "channel %s", sysev_channel); } - sysev_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY, NULL, NULL); + sysev_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY | + FMD_XPRT_CACHE_AS_LOCAL, NULL, NULL); sysev_hdl = hdl; /* diff --git a/usr/src/cmd/fm/fmd/common/fmd_xprt.c b/usr/src/cmd/fm/fmd/common/fmd_xprt.c index 0a4cf885de..323315cfa3 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_xprt.c +++ b/usr/src/cmd/fm/fmd/common/fmd_xprt.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -101,6 +101,8 @@ * FMA Class Payload * --------- ------- * resource.fm.xprt.uuclose string (uuid of case) + * resource.fm.xprt.uuresolved string (uuid of case) + * resource.fm.xprt.updated string (uuid of case) * resource.fm.xprt.subscribe string (class pattern) * resource.fm.xprt.unsubscribe string (class pattern) * resource.fm.xprt.unsuback string (class pattern) @@ -176,6 +178,8 @@ const fmd_xprt_rule_t _fmd_xprt_state_run[] = { { "resource.fm.xprt.unsubscribe", fmd_xprt_event_unsub }, { "resource.fm.xprt.unsuback", fmd_xprt_event_unsuback }, { "resource.fm.xprt.uuclose", fmd_xprt_event_uuclose }, +{ "resource.fm.xprt.uuresolved", fmd_xprt_event_uuresolved }, +{ "resource.fm.xprt.updated", fmd_xprt_event_updated }, { "resource.fm.xprt.*", fmd_xprt_event_error }, { NULL, NULL } }; @@ -510,8 +514,8 @@ fmd_xprt_send_case(fmd_case_t *cp, void *arg) nvlist_t *nvl; char *class; - if (cip->ci_state != FMD_CASE_SOLVED) - return; /* unsolved, or we'll get it during the ASRU pass */ + if (cip->ci_state == FMD_CASE_UNSOLVED) + return; nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS); (void) nvlist_lookup_string(nvl, FM_CLASS, &class); @@ -523,49 +527,12 @@ fmd_xprt_send_case(fmd_case_t *cp, void 
*arg) fmd_dispq_dispatch_gid(fmd.d_disp, e, class, xip->xi_queue->eq_sgid); } -/* - * Upon transition to RUN, we take every ASRU which is in the degraded state - * and resend a fault.* event for it to our remote peer, in case the peer is - * running in the fault manager that knows how to disable this resource. If - * any new resources are added to the cache during our iteration, this is no - * problem because our subscriptions are already proxied and so any new cases - * will result in a list.suspect event being transported if that is needed. - */ -static void -fmd_xprt_send_asru(fmd_asru_t *ap, void *arg) -{ - fmd_xprt_impl_t *xip = arg; - nvlist_t *nvl = NULL; - fmd_event_t *e; - char *class; - - (void) pthread_mutex_lock(&ap->asru_lock); - - if ((ap->asru_flags & (FMD_ASRU_INTERNAL | FMD_ASRU_STATE)) == - FMD_ASRU_FAULTY && fmd_case_orphaned(ap->asru_case)) - (void) nvlist_xdup(ap->asru_event, &nvl, &fmd.d_nva); - - (void) pthread_mutex_unlock(&ap->asru_lock); - - if (nvl == NULL) - return; /* asru is internal, unusable, or not faulty */ - - (void) nvlist_lookup_string(nvl, FM_CLASS, &class); - e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); - - fmd_dprintf(FMD_DBG_XPRT, "re-send %s for %s to transport %u\n", - class, ap->asru_name, xip->xi_id); - - fmd_dispq_dispatch_gid(fmd.d_disp, e, class, xip->xi_queue->eq_sgid); -} - void fmd_xprt_event_run(fmd_xprt_impl_t *xip, nvlist_t *nvl) { if (!fmd_xprt_vmismatch(xip, nvl, NULL)) { fmd_xprt_transition(xip, _fmd_xprt_state_run, "RUN"); fmd_case_hash_apply(fmd.d_cases, fmd_xprt_send_case, xip); - fmd_asru_hash_apply(fmd.d_asrus, fmd_xprt_send_asru, xip); } } @@ -633,6 +600,9 @@ fmd_xprt_event_unsuback(fmd_xprt_impl_t *xip, nvlist_t *nvl) (void) pthread_mutex_unlock(&xip->xi_lock); } +/* + * on diagnosing side, receive a uuclose from the proxy. 
+ */ void fmd_xprt_event_uuclose(fmd_xprt_impl_t *xip, nvlist_t *nvl) { @@ -644,11 +614,77 @@ fmd_xprt_event_uuclose(fmd_xprt_impl_t *xip, nvlist_t *nvl) if (nvlist_lookup_string(nvl, FM_RSRC_XPRT_UUID, &uuid) == 0 && (cp = fmd_case_hash_lookup(fmd.d_cases, uuid)) != NULL) { + /* + * update resource cache status and transition case + */ + fmd_case_close_status(cp); fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_ISOLATED); fmd_case_rele(cp); } } +/* + * on diagnosing side, receive a uuresolved from the proxy. + */ +void +fmd_xprt_event_uuresolved(fmd_xprt_impl_t *xip, nvlist_t *nvl) +{ + fmd_case_t *cp; + char *uuid; + + if (fmd_xprt_vmismatch(xip, nvl, NULL)) + return; /* transitioned to error state */ + + if (nvlist_lookup_string(nvl, FM_RSRC_XPRT_UUID, &uuid) == 0 && + (cp = fmd_case_hash_lookup(fmd.d_cases, uuid)) != NULL) { + fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; + + fmd_case_transition(cp, (cip->ci_state == FMD_CASE_REPAIRED) ? + FMD_CASE_RESOLVED : (cip->ci_state == FMD_CASE_CLOSED) ? + FMD_CASE_REPAIRED : FMD_CASE_CLOSE_WAIT, FMD_CF_RESOLVED); + fmd_case_rele(cp); + } +} + +/* + * on diagnosing side, receive a repair/acquit from the proxy. + */ +void +fmd_xprt_event_updated(fmd_xprt_impl_t *xip, nvlist_t *nvl) +{ + fmd_case_t *cp; + char *uuid; + + if (fmd_xprt_vmismatch(xip, nvl, NULL)) + return; /* transitioned to error state */ + + if (nvlist_lookup_string(nvl, FM_RSRC_XPRT_UUID, &uuid) == 0 && + (cp = fmd_case_hash_lookup(fmd.d_cases, uuid)) != NULL) { + uint8_t *statusp, *proxy_asrup = NULL; + uint_t nelem = 0; + + /* + * Only update status with new repairs if "no remote repair" + * is not set. Do the case_update anyway though (as this will + * refresh the status on the proxy side). 
+ */ + if (!(xip->xi_flags & FMD_XPRT_NO_REMOTE_REPAIR)) { + if (nvlist_lookup_uint8_array(nvl, + FM_RSRC_XPRT_FAULT_STATUS, &statusp, &nelem) == 0 && + nelem != 0) { + (void) nvlist_lookup_uint8_array(nvl, + FM_RSRC_XPRT_FAULT_HAS_ASRU, &proxy_asrup, + &nelem); + fmd_case_update_status(cp, statusp, + proxy_asrup, NULL); + } + fmd_case_update_containees(cp); + } + fmd_case_update(cp); + fmd_case_rele(cp); + } +} + void fmd_xprt_event_error(fmd_xprt_impl_t *xip, nvlist_t *nvl) { @@ -879,12 +915,13 @@ fmd_xprt_destroy(fmd_xprt_t *xp) /* * Release every case handle in the module that was cached by this * transport. This will result in these cases disappearing from the - * local case hash so that fmd_case_uuclose() can no longer be used. + * local case hash so that fmd_case_uuclose() and fmd_case_repaired() + * etc can no longer be used. */ for (cip = fmd_list_next(&mp->mod_cases); cip != NULL; cip = nip) { nip = fmd_list_next(cip); if (cip->ci_xprt == xp) - fmd_case_discard((fmd_case_t *)cip); + fmd_case_discard((fmd_case_t *)cip, B_TRUE); } /* @@ -998,6 +1035,351 @@ fmd_xprt_send(fmd_xprt_t *xp) } } +/* + * This function creates a local suspect list. This is used when a suspect list + * is created directly by an external source like fminject. 
+ */ +static void +fmd_xprt_list_suspect_local(fmd_xprt_t *xp, nvlist_t *nvl) +{ + nvlist_t **nvlp; + nvlist_t *de_fmri, *de_fmri_dup = NULL; + int64_t *diag_time; + char *code = NULL; + fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)xp; + fmd_case_t *cp; + uint_t nelem = 0, nelem2 = 0, i; + + fmd_module_lock(xip->xi_queue->eq_mod); + cp = fmd_case_create(xip->xi_queue->eq_mod, NULL); + if (cp == NULL) { + fmd_module_unlock(xip->xi_queue->eq_mod); + return; + } + + /* + * copy diag_code if present + */ + (void) nvlist_lookup_string(nvl, FM_SUSPECT_DIAG_CODE, &code); + if (code != NULL) { + fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; + + cip->ci_precanned = 1; + fmd_case_setcode(cp, code); + } + + /* + * copy suspects + */ + (void) nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, &nvlp, + &nelem); + for (i = 0; i < nelem; i++) { + nvlist_t *flt_copy, *asru = NULL, *fru = NULL, *rsrc = NULL; + topo_hdl_t *thp; + char *loc = NULL; + int err; + + thp = fmd_fmri_topo_hold(TOPO_VERSION); + (void) nvlist_xdup(nvlp[i], &flt_copy, &fmd.d_nva); + (void) nvlist_lookup_nvlist(nvlp[i], FM_FAULT_RESOURCE, &rsrc); + + /* + * If no fru specified, get it from topo + */ + if (nvlist_lookup_nvlist(nvlp[i], FM_FAULT_FRU, &fru) != 0 && + rsrc && topo_fmri_fru(thp, rsrc, &fru, &err) == 0) + (void) nvlist_add_nvlist(flt_copy, FM_FAULT_FRU, fru); + /* + * If no asru specified, get it from topo + */ + if (nvlist_lookup_nvlist(nvlp[i], FM_FAULT_ASRU, &asru) != 0 && + rsrc && topo_fmri_asru(thp, rsrc, &asru, &err) == 0) + (void) nvlist_add_nvlist(flt_copy, FM_FAULT_ASRU, asru); + /* + * If no location specified, get it from topo + */ + if (nvlist_lookup_string(nvlp[i], FM_FAULT_LOCATION, + &loc) != 0) { + if (fru && topo_fmri_label(thp, fru, &loc, &err) == 0) + (void) nvlist_add_string(flt_copy, + FM_FAULT_LOCATION, loc); + else if (rsrc && topo_fmri_label(thp, rsrc, &loc, + &err) == 0) + (void) nvlist_add_string(flt_copy, + FM_FAULT_LOCATION, loc); + if (loc) + topo_hdl_strfree(thp, 
loc); + } + if (fru) + nvlist_free(fru); + if (asru) + nvlist_free(asru); + if (rsrc) + nvlist_free(rsrc); + fmd_fmri_topo_rele(thp); + fmd_case_insert_suspect(cp, flt_copy); + } + + /* + * copy diag_time if present + */ + if (nvlist_lookup_int64_array(nvl, FM_SUSPECT_DIAG_TIME, &diag_time, + &nelem2) == 0 && nelem2 >= 2) + fmd_case_settime(cp, diag_time[0], diag_time[1]); + + /* + * copy DE fmri if present + */ + if (nvlist_lookup_nvlist(nvl, FM_SUSPECT_DE, &de_fmri) == 0) { + (void) nvlist_xdup(de_fmri, &de_fmri_dup, &fmd.d_nva); + fmd_case_set_de_fmri(cp, de_fmri_dup); + } + + fmd_case_transition(cp, FMD_CASE_SOLVED, FMD_CF_SOLVED); + fmd_module_unlock(xip->xi_queue->eq_mod); +} + +/* + * This function is called to create a proxy case on receipt of a list.suspect + * from the diagnosing side of the transport. + */ +static void +fmd_xprt_list_suspect(fmd_xprt_t *xp, nvlist_t *nvl) +{ + fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)xp; + nvlist_t **nvlp; + uint_t nelem = 0, nelem2 = 0, i; + int64_t *diag_time; + topo_hdl_t *thp; + char *class; + nvlist_t *rsrc, *asru, *de_fmri, *de_fmri_dup = NULL; + nvlist_t *flt_copy; + int err; + nvlist_t **asrua; + uint8_t *proxy_asru = NULL; + int got_proxy_asru = 0; + int got_hc_rsrc = 0; + int got_present_rsrc = 0; + uint8_t *diag_asru = NULL; + char *scheme; + uint8_t *statusp; + char *uuid, *code; + fmd_case_t *cp; + fmd_case_impl_t *cip; + int need_update = 0; + + if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0) + return; + if (nvlist_lookup_string(nvl, FM_SUSPECT_DIAG_CODE, &code) != 0) + return; + (void) nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, &nvlp, + &nelem); + + /* + * In order to implement FMD_XPRT_HCONLY and FMD_XPRT_HC_PRESENT_ONLY + * etc we first scan the suspects to see if + * - there was an asru in the received fault + * - there was an hc-scheme resource in the received fault + * - any hc-scheme resource in the received fault is present in the + * local topology + * - any hc-scheme 
resource in the received fault has an asru in the + * local topology + */ + if (nelem > 0) { + asrua = fmd_zalloc(sizeof (nvlist_t *) * nelem, FMD_SLEEP); + proxy_asru = fmd_zalloc(sizeof (uint8_t) * nelem, FMD_SLEEP); + diag_asru = fmd_zalloc(sizeof (uint8_t) * nelem, FMD_SLEEP); + thp = fmd_fmri_topo_hold(TOPO_VERSION); + for (i = 0; i < nelem; i++) { + if (nvlist_lookup_nvlist(nvlp[i], FM_FAULT_ASRU, + &asru) == 0 && asru != NULL) + diag_asru[i] = 1; + if (nvlist_lookup_string(nvlp[i], FM_CLASS, + &class) != 0 || strncmp(class, "fault", 5) != 0) + continue; + /* + * If there is an hc-scheme asru, use that to find the + * real asru. Otherwise if there is an hc-scheme + * resource, work out the old asru from that. + * This order is to allow a two stage evaluation + * of the asru where a fault in the diagnosing side + * is in a component not visible to the proxy side, + * but prevents a component that is visible from + * working. So the diagnosing side sets the asru to + * the latter component (in hc-scheme as the diagnosing + * side doesn't know about the proxy side's virtual + * schemes), and then the proxy side can convert that + * to a suitable virtual scheme asru. 
+ */ + if (nvlist_lookup_nvlist(nvlp[i], FM_FAULT_ASRU, + &asru) == 0 && asru != NULL && + nvlist_lookup_string(asru, FM_FMRI_SCHEME, + &scheme) == 0 && + strcmp(scheme, FM_FMRI_SCHEME_HC) == 0) { + got_hc_rsrc = 1; + if (xip->xi_flags & FMD_XPRT_EXTERNAL) + continue; + if (topo_fmri_present(thp, asru, &err) == 0) + got_present_rsrc = 1; + if (topo_fmri_asru(thp, asru, &asrua[i], + &err) == 0) { + proxy_asru[i] = + FMD_PROXY_ASRU_FROM_ASRU; + got_proxy_asru = 1; + } + } else if (nvlist_lookup_nvlist(nvlp[i], + FM_FAULT_RESOURCE, &rsrc) == 0 && rsrc != NULL && + nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, + &scheme) == 0 && + strcmp(scheme, FM_FMRI_SCHEME_HC) == 0) { + got_hc_rsrc = 1; + if (xip->xi_flags & FMD_XPRT_EXTERNAL) + continue; + if (topo_fmri_present(thp, rsrc, &err) == 0) + got_present_rsrc = 1; + if (topo_fmri_asru(thp, rsrc, &asrua[i], + &err) == 0) { + proxy_asru[i] = + FMD_PROXY_ASRU_FROM_RSRC; + got_proxy_asru = 1; + } + } + } + fmd_fmri_topo_rele(thp); + } + + /* + * If we're set up only to report hc-scheme faults, and + * there aren't any, then just drop the event. + */ + if (got_hc_rsrc == 0 && (xip->xi_flags & FMD_XPRT_HCONLY)) { + if (nelem > 0) { + fmd_free(proxy_asru, sizeof (uint8_t) * nelem); + fmd_free(diag_asru, sizeof (uint8_t) * nelem); + fmd_free(asrua, sizeof (nvlist_t *) * nelem); + } + return; + } + + /* + * If we're set up only to report locally present hc-scheme + * faults, and there aren't any, then just drop the event. + */ + if (got_present_rsrc == 0 && + (xip->xi_flags & FMD_XPRT_HC_PRESENT_ONLY)) { + if (nelem > 0) { + for (i = 0; i < nelem; i++) + if (asrua[i]) + nvlist_free(asrua[i]); + fmd_free(proxy_asru, sizeof (uint8_t) * nelem); + fmd_free(diag_asru, sizeof (uint8_t) * nelem); + fmd_free(asrua, sizeof (nvlist_t *) * nelem); + } + return; + } + + /* + * If fmd_case_recreate() returns NULL, UUID is already known. 
+ */ + fmd_module_lock(xip->xi_queue->eq_mod); + if ((cp = fmd_case_recreate(xip->xi_queue->eq_mod, xp, + FMD_CASE_UNSOLVED, uuid, code)) == NULL) { + if (nelem > 0) { + for (i = 0; i < nelem; i++) + if (asrua[i]) + nvlist_free(asrua[i]); + fmd_free(proxy_asru, sizeof (uint8_t) * nelem); + fmd_free(diag_asru, sizeof (uint8_t) * nelem); + fmd_free(asrua, sizeof (nvlist_t *) * nelem); + } + fmd_module_unlock(xip->xi_queue->eq_mod); + return; + } + + cip = (fmd_case_impl_t *)cp; + cip->ci_diag_asru = diag_asru; + cip->ci_proxy_asru = proxy_asru; + for (i = 0; i < nelem; i++) { + (void) nvlist_xdup(nvlp[i], &flt_copy, &fmd.d_nva); + if (proxy_asru[i] != FMD_PROXY_ASRU_NOT_NEEDED) { + /* + * Copy suspects, but remove/replace asru first. Also if + * the original asru was hc-scheme use that as resource. + */ + if (proxy_asru[i] == FMD_PROXY_ASRU_FROM_ASRU) { + (void) nvlist_remove(flt_copy, + FM_FAULT_RESOURCE, DATA_TYPE_NVLIST); + (void) nvlist_lookup_nvlist(flt_copy, + FM_FAULT_ASRU, &asru); + (void) nvlist_add_nvlist(flt_copy, + FM_FAULT_RESOURCE, asru); + } + (void) nvlist_remove(flt_copy, FM_FAULT_ASRU, + DATA_TYPE_NVLIST); + (void) nvlist_add_nvlist(flt_copy, FM_FAULT_ASRU, + asrua[i]); + nvlist_free(asrua[i]); + } else if (nvlist_lookup_nvlist(flt_copy, FM_FAULT_ASRU, + &asru) == 0 && asru != NULL) { + /* + * keep asru from diag side, but mark as no retire + */ + (void) nvlist_add_boolean_value(flt_copy, + FM_SUSPECT_RETIRE, B_FALSE); + } + fmd_case_insert_suspect(cp, flt_copy); + } + /* + * copy diag_time + */ + if (nvlist_lookup_int64_array(nvl, FM_SUSPECT_DIAG_TIME, &diag_time, + &nelem2) == 0 && nelem2 >= 2) + fmd_case_settime(cp, diag_time[0], diag_time[1]); + /* + * copy DE fmri + */ + if (nvlist_lookup_nvlist(nvl, FM_SUSPECT_DE, &de_fmri) == 0) { + (void) nvlist_xdup(de_fmri, &de_fmri_dup, &fmd.d_nva); + fmd_case_set_de_fmri(cp, de_fmri_dup); + } + + /* + * Transition to solved. This will log the suspect list and create + * the resource cache entries.
+ */ + fmd_case_transition(cp, FMD_CASE_SOLVED, FMD_CF_SOLVED); + + /* + * Update status if it is not simply "all faulty" (can happen if + * list.suspects are being re-sent when the transport has reconnected). + */ + (void) nvlist_lookup_uint8_array(nvl, FM_SUSPECT_FAULT_STATUS, &statusp, + &nelem); + for (i = 0; i < nelem; i++) { + if ((statusp[i] & (FM_SUSPECT_FAULTY | FM_SUSPECT_UNUSABLE | + FM_SUSPECT_NOT_PRESENT | FM_SUSPECT_DEGRADED)) != + FM_SUSPECT_FAULTY) + need_update = 1; + } + if (need_update) { + fmd_case_update_status(cp, statusp, cip->ci_proxy_asru, + cip->ci_diag_asru); + fmd_case_update_containees(cp); + fmd_case_update(cp); + } + + /* + * if asru on proxy side, send an update back to the diagnosing side to + * update UNUSABLE/DEGRADED. + */ + if (got_proxy_asru) + fmd_case_xprt_updated(cp); + + if (nelem > 0) + fmd_free(asrua, sizeof (nvlist_t *) * nelem); + fmd_module_unlock(xip->xi_queue->eq_mod); +} + void fmd_xprt_recv(fmd_xprt_t *xp, nvlist_t *nvl, hrtime_t hrt, boolean_t logonly) { @@ -1006,12 +1388,13 @@ fmd_xprt_recv(fmd_xprt_t *xp, nvlist_t *nvl, hrtime_t hrt, boolean_t logonly) fmd_t *dp = &fmd; fmd_event_t *e; - char *class, *uuid, *code; + char *class, *uuid; boolean_t isproto, isereport; uint64_t *tod; uint8_t ttl; uint_t n; + fmd_case_t *cp; /* * Grab the transport lock and set the busy flag to indicate we are @@ -1165,20 +1548,100 @@ fmd_xprt_recv(fmd_xprt_t *xp, nvlist_t *nvl, hrtime_t hrt, boolean_t logonly) /* * If a list.suspect event is received, create a case for the specified - * UUID in the case hash, with the transport module as its owner. If - * the UUID is already known, fmd_case_recreate() will return NULL and - * we simply proceed to our normal event handling regardless. + * UUID in the case hash, with the transport module as its owner. 
+ */ + if (fmd_event_match(e, FMD_EVT_PROTOCOL, FM_LIST_SUSPECT_CLASS)) { + if (xip->xi_flags & FMD_XPRT_CACHE_AS_LOCAL) + fmd_xprt_list_suspect_local(xp, nvl); + else + fmd_xprt_list_suspect(xp, nvl); + fmd_event_hold(e); + fmd_event_rele(e); + goto done; + } + + /* + * If a list.updated or list.repaired event is received, update the + * resource cache status and the local case. + */ + if (fmd_event_match(e, FMD_EVT_PROTOCOL, FM_LIST_REPAIRED_CLASS) || + fmd_event_match(e, FMD_EVT_PROTOCOL, FM_LIST_UPDATED_CLASS)) { + uint8_t *statusp; + uint_t nelem = 0; + + (void) nvlist_lookup_uint8_array(nvl, FM_SUSPECT_FAULT_STATUS, + &statusp, &nelem); + fmd_module_lock(xip->xi_queue->eq_mod); + if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0 && + (cp = fmd_case_hash_lookup(fmd.d_cases, uuid)) != NULL) { + fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; + if (cip->ci_xprt != NULL) { + fmd_case_update_status(cp, statusp, + cip->ci_proxy_asru, cip->ci_diag_asru); + fmd_case_update_containees(cp); + fmd_case_update(cp); + } + fmd_case_rele(cp); + } + fmd_module_unlock(xip->xi_queue->eq_mod); + fmd_event_hold(e); + fmd_event_rele(e); + goto done; + } + + /* + * If a list.isolated event is received, update resource cache status + */ + if (fmd_event_match(e, FMD_EVT_PROTOCOL, FM_LIST_ISOLATED_CLASS)) { + uint8_t *statusp; + uint_t nelem = 0; + + (void) nvlist_lookup_uint8_array(nvl, FM_SUSPECT_FAULT_STATUS, + &statusp, &nelem); + fmd_module_lock(xip->xi_queue->eq_mod); + if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0 && + (cp = fmd_case_hash_lookup(fmd.d_cases, uuid)) != NULL) { + fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; + if (cip->ci_xprt != NULL) + fmd_case_update_status(cp, statusp, + cip->ci_proxy_asru, cip->ci_diag_asru); + fmd_case_rele(cp); + } + fmd_module_unlock(xip->xi_queue->eq_mod); + fmd_event_hold(e); + fmd_event_rele(e); + goto done; + } + + /* + * If a list.resolved event is received, resolve the local case. 
*/ - if (fmd_event_match(e, FMD_EVT_PROTOCOL, FM_LIST_SUSPECT_CLASS) && - nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0 && - nvlist_lookup_string(nvl, FM_SUSPECT_DIAG_CODE, &code) == 0) { + if (fmd_event_match(e, FMD_EVT_PROTOCOL, FM_LIST_RESOLVED_CLASS)) { fmd_module_lock(xip->xi_queue->eq_mod); - (void) fmd_case_recreate(xip->xi_queue->eq_mod, - xp, FMD_CASE_SOLVED, uuid, code); + if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0 && + (cp = fmd_case_hash_lookup(fmd.d_cases, uuid)) != NULL) { + fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; + if (cip->ci_xprt != NULL) + fmd_case_transition(cp, (cip->ci_state == + FMD_CASE_REPAIRED) ? FMD_CASE_RESOLVED : + (cip->ci_state == FMD_CASE_CLOSED) ? + FMD_CASE_REPAIRED : FMD_CASE_CLOSE_WAIT, + FMD_CF_RESOLVED); + fmd_case_rele(cp); + } fmd_module_unlock(xip->xi_queue->eq_mod); + fmd_event_hold(e); + fmd_event_rele(e); + goto done; } - if (logonly == FMD_B_TRUE) { + if (logonly == FMD_B_TRUE || (xip->xi_flags & FMD_XPRT_EXTERNAL)) { + /* + * Don't proxy ereports on an EXTERNAL transport - we won't + * know how to diagnose them with the wrong topology. Note + * that here (and above) we have to hold/release the event in + * order for it to be freed. 
+ */ fmd_event_hold(e); fmd_event_rele(e); } else if (isproto == FMD_B_TRUE) @@ -1204,8 +1667,10 @@ fmd_xprt_uuclose(fmd_xprt_t *xp, const char *uuid) nvlist_t *nvl; char *s; - fmd_dprintf(FMD_DBG_XPRT, - "xprt %u closing case %s\n", xip->xi_id, uuid); + if ((xip->xi_flags & FMD_XPRT_RDWR) == FMD_XPRT_RDONLY) + return; /* read-only transports do not proxy uuclose */ + + TRACE((FMD_DBG_XPRT, "xprt %u closing case %s\n", xip->xi_id, uuid)); nvl = fmd_protocol_xprt_uuclose(xip->xi_queue->eq_mod, "resource.fm.xprt.uuclose", xip->xi_version, uuid); @@ -1216,6 +1681,58 @@ fmd_xprt_uuclose(fmd_xprt_t *xp, const char *uuid) } /* + * On proxy side, send back uuresolved request to diagnosing side + */ +void +fmd_xprt_uuresolved(fmd_xprt_t *xp, const char *uuid) +{ + fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)xp; + + fmd_event_t *e; + nvlist_t *nvl; + char *s; + + if ((xip->xi_flags & FMD_XPRT_RDWR) == FMD_XPRT_RDONLY) + return; /* read-only transports do not proxy uuresolved */ + + TRACE((FMD_DBG_XPRT, "xprt %u resolving case %s\n", xip->xi_id, uuid)); + + nvl = fmd_protocol_xprt_uuresolved(xip->xi_queue->eq_mod, + "resource.fm.xprt.uuresolved", xip->xi_version, uuid); + + (void) nvlist_lookup_string(nvl, FM_CLASS, &s); + e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, s); + fmd_eventq_insert_at_time(xip->xi_queue, e); +} + +/* + * On proxy side, send back repair/acquit/etc request to diagnosing side + */ +void +fmd_xprt_updated(fmd_xprt_t *xp, const char *uuid, uint8_t *statusp, + uint8_t *has_asrup, uint_t nelem) +{ + fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)xp; + + fmd_event_t *e; + nvlist_t *nvl; + char *s; + + if ((xip->xi_flags & FMD_XPRT_RDWR) == FMD_XPRT_RDONLY) + return; /* read-only transports do not support remote repairs */ + + TRACE((FMD_DBG_XPRT, "xprt %u updating case %s\n", xip->xi_id, uuid)); + + nvl = fmd_protocol_xprt_updated(xip->xi_queue->eq_mod, + "resource.fm.xprt.updated", xip->xi_version, uuid, statusp, + has_asrup, nelem); + + (void) 
nvlist_lookup_string(nvl, FM_CLASS, &s); + e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, s); + fmd_eventq_insert_at_time(xip->xi_queue, e); +} + +/* * Insert the specified class into our remote subscription hash. If the class * is already present, bump the reference count; otherwise add it to the hash * and then enqueue an event for our remote peer to proxy our subscription. diff --git a/usr/src/cmd/fm/fmd/common/fmd_xprt.h b/usr/src/cmd/fm/fmd/common/fmd_xprt.h index 41054fdc35..2aae76d7b9 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_xprt.h +++ b/usr/src/cmd/fm/fmd/common/fmd_xprt.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -56,6 +56,8 @@ extern fmd_xprt_rule_f fmd_xprt_event_unsuback; extern fmd_xprt_rule_f fmd_xprt_event_uuclose; extern fmd_xprt_rule_f fmd_xprt_event_error; extern fmd_xprt_rule_f fmd_xprt_event_drop; +extern fmd_xprt_rule_f fmd_xprt_event_uuresolved; +extern fmd_xprt_rule_f fmd_xprt_event_updated; typedef struct fmd_xprt_rule { const char *xr_class; /* pattern to match */ @@ -123,11 +125,19 @@ typedef struct fmd_xprt_impl { #define FMD_XPRT_RDWR 0x3 /* xprt is read-write */ #define FMD_XPRT_ACCEPT 0x4 /* xprt is accepting connection */ #define FMD_XPRT_SUSPENDED 0x8 /* xprt is suspended by user */ -#define FMD_XPRT_CMASK 0xF /* xprt create flag mask */ #define FMD_XPRT_SUBSCRIBER 0x10 /* xprt is actively subscribing */ #define FMD_XPRT_ISUSPENDED 0x20 /* xprt is waiting for _fmd_init */ #define FMD_XPRT_DSUSPENDED 0x40 /* xprt is suspended by fmd mechanism */ - +#define FMD_XPRT_EXTERNAL 0x80 /* xprt is external to a chassis */ +#define FMD_XPRT_NO_REMOTE_REPAIR 0x100 /* xprt disallows remote repair */ +#define FMD_XPRT_CACHE_AS_LOCAL 0x200 /* xprt caches fault as if local */ +#define FMD_XPRT_HCONLY 0x400 /* xprt only proxies hc-scheme faults */ +#define FMD_XPRT_HC_PRESENT_ONLY 0x800 /* only
locally present hc faults */ + +#define FMD_XPRT_CMASK /* xprt create flag mask */ \ + (FMD_XPRT_RDWR | FMD_XPRT_ACCEPT | FMD_XPRT_SUSPENDED | \ + FMD_XPRT_EXTERNAL | FMD_XPRT_NO_REMOTE_REPAIR | \ + FMD_XPRT_CACHE_AS_LOCAL | FMD_XPRT_HCONLY | FMD_XPRT_HC_PRESENT_ONLY) #define FMD_XPRT_SMASK \ (FMD_XPRT_SUSPENDED | FMD_XPRT_ISUSPENDED | FMD_XPRT_DSUSPENDED) @@ -138,6 +148,9 @@ extern void fmd_xprt_xresume(fmd_xprt_t *, uint_t); extern void fmd_xprt_send(fmd_xprt_t *); extern void fmd_xprt_recv(fmd_xprt_t *, nvlist_t *, hrtime_t, boolean_t); extern void fmd_xprt_uuclose(fmd_xprt_t *, const char *); +extern void fmd_xprt_uuresolved(fmd_xprt_t *, const char *); +extern void fmd_xprt_updated(fmd_xprt_t *, const char *, uint8_t *, uint8_t *, + uint_t); extern void fmd_xprt_subscribe(fmd_xprt_t *, const char *); extern void fmd_xprt_unsubscribe(fmd_xprt_t *, const char *); diff --git a/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c b/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c index 3c6c172e58..32e9c6504c 100644 --- a/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c +++ b/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c @@ -493,6 +493,9 @@ cma_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) { fmd_hdl_debug(hdl, "received %s\n", class); + if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) + return; + if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 || strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 || strcmp(class, FM_LIST_UPDATED_CLASS) == 0) diff --git a/usr/src/cmd/fm/modules/common/disk-monitor/disk_monitor.c b/usr/src/cmd/fm/modules/common/disk-monitor/disk_monitor.c index 7e12e7abf6..167873cd8b 100644 --- a/usr/src/cmd/fm/modules/common/disk-monitor/disk_monitor.c +++ b/usr/src/cmd/fm/modules/common/disk-monitor/disk_monitor.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Disk Monitor */ @@ -284,6 +282,8 @@ diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) diskmon_agent_suspect(hdl, nvl); return; + } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_RESOLVED_CLASS)) { + return; } /* diff --git a/usr/src/cmd/fm/modules/common/io-retire/rio_main.c b/usr/src/cmd/fm/modules/common/io-retire/rio_main.c index 0dfd1415ba..c5953a70cb 100644 --- a/usr/src/cmd/fm/modules/common/io-retire/rio_main.c +++ b/usr/src/cmd/fm/modules/common/io-retire/rio_main.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/fm/protocol.h> #include <fm/fmd_api.h> #include <strings.h> @@ -132,6 +130,7 @@ rio_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) di_retire_t drt = {0}; int retire; int rval = 0; + int valid_suspect = 0; int error; char *snglfault = FM_FAULT_CLASS"."FM_ERROR_IO"."; boolean_t rtr; @@ -155,6 +154,8 @@ rio_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) retire = 0; } else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) { retire = 0; + } else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) { + return; } else if (strncmp(class, snglfault, strlen(snglfault)) == 0) { retire = 1; faults = &nvl; @@ -200,6 +201,7 @@ rio_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) continue; } + valid_suspect = 1; if (retire) { if (fmd_nvl_fmri_has_fault(hdl, asru, FMD_HAS_FAULT_ASRU, NULL) == 1) { @@ -226,6 +228,13 @@ rio_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) } /* + * Don't send uuclose or uuresolved unless at least one suspect + * was valid for this retire agent and no retires/unretires failed. 
+ */ + if (valid_suspect == 0) + return; + + /* * The fmd framework takes care of moving a case to the repaired * state. To move the case to the closed state however, we (the * retire agent) need to call fmd_case_uuclose() diff --git a/usr/src/cmd/fm/modules/common/ip-transport/ip.c b/usr/src/cmd/fm/modules/common/ip-transport/ip.c index a70d62e883..8a5b500b9f 100644 --- a/usr/src/cmd/fm/modules/common/ip-transport/ip.c +++ b/usr/src/cmd/fm/modules/common/ip-transport/ip.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/socket.h> #include <sys/sysmacros.h> @@ -94,6 +92,12 @@ static size_t ip_size; /* default buffer size */ static volatile int ip_quit; /* signal to quit */ static int ip_qlen; /* queue length for listen(3SOCKET) */ static int ip_mtbf; /* mtbf for simulating packet drop */ +static int ip_external; /* set transport to be "external" */ +static int ip_no_remote_repair; /* disallow remote repair */ +static int ip_hconly; /* only cache faults that are hc-scheme */ +static int ip_rdonly; /* force transport to be rdonly */ +static int ip_hc_present_only; /* only cache faults if hc-scheme and present */ +static char *ip_domain_name; /* set domain name for received list.suspects */ static hrtime_t ip_burp; /* make mtbf slower by adding this much delay */ static int ip_translate; /* call fmd_xprt_translate() before sending */ static char *ip_host; /* host to connect to (or NULL if server) */ @@ -323,8 +327,11 @@ ip_xprt_recv_event(ip_xprt_t *ipx) fmd_hdl_error(ip_hdl, "failed to unpack event from " "transport %p: %s\n", (void *)ipx->ipx_xprt, strerror(err)); ip_stat.ips_unpackfail.fmds_value.ui64++; - } else + } else { + if (ip_domain_name) + fmd_xprt_add_domain(ip_hdl, nvl, ip_domain_name); fmd_xprt_post(ip_hdl, ipx->ipx_xprt, nvl, 0); + } if 
(fmd_xprt_error(ip_hdl, ipx->ipx_xprt)) { fmd_hdl_error(ip_hdl, "protocol error on transport %p", @@ -466,10 +473,26 @@ ip_xprt_setup(fmd_hdl_t *hdl) struct addrinfo *aip; const char *s1, *s2; + /* + * Set up flags as specified in the .conf file. Note that these are + * mostly only used for testing purposes, allowing the transport to + * be set up in various modes. + */ if (ip_host != NULL) - xflags = FMD_XPRT_RDWR; + xflags = (ip_rdonly == FMD_B_TRUE) ? FMD_XPRT_RDONLY : + FMD_XPRT_RDWR; else - xflags = FMD_XPRT_RDWR | FMD_XPRT_ACCEPT; + xflags = ((ip_rdonly == FMD_B_TRUE) ? FMD_XPRT_RDONLY : + FMD_XPRT_RDWR) | FMD_XPRT_ACCEPT; + + if (ip_external == FMD_B_TRUE) + xflags |= FMD_XPRT_EXTERNAL; + if (ip_no_remote_repair == FMD_B_TRUE) + xflags |= FMD_XPRT_NO_REMOTE_REPAIR; + if (ip_hconly == FMD_B_TRUE) + xflags |= FMD_XPRT_HCONLY; + if (ip_hc_present_only == FMD_B_TRUE) + xflags |= FMD_XPRT_HC_PRESENT_ONLY; for (aip = ip_ail; aip != NULL; aip = aip->ai_next) { if (aip->ai_family != AF_INET && aip->ai_family != AF_INET6) @@ -554,6 +577,12 @@ static const fmd_prop_t fmd_props[] = { { "ip_burp", FMD_TYPE_TIME, "0" }, { "ip_enable", FMD_TYPE_BOOL, "false" }, { "ip_mtbf", FMD_TYPE_INT32, "0" }, + { "ip_external", FMD_TYPE_BOOL, "true" }, + { "ip_no_remote_repair", FMD_TYPE_BOOL, "true" }, + { "ip_hconly", FMD_TYPE_BOOL, "false" }, + { "ip_rdonly", FMD_TYPE_BOOL, "false" }, + { "ip_hc_present_only", FMD_TYPE_BOOL, "false" }, + { "ip_domain_name", FMD_TYPE_STRING, NULL }, { "ip_port", FMD_TYPE_STRING, "664" }, { "ip_qlen", FMD_TYPE_INT32, "32" }, { "ip_retry", FMD_TYPE_UINT32, "50" }, @@ -614,6 +643,12 @@ _fmd_init(fmd_hdl_t *hdl) ip_burp = fmd_prop_get_int64(hdl, "ip_burp"); ip_mtbf = fmd_prop_get_int32(hdl, "ip_mtbf"); + ip_external = fmd_prop_get_int32(hdl, "ip_external"); + ip_no_remote_repair = fmd_prop_get_int32(hdl, "ip_no_remote_repair"); + ip_hconly = fmd_prop_get_int32(hdl, "ip_hconly"); + ip_rdonly = fmd_prop_get_int32(hdl, "ip_rdonly"); + ip_hc_present_only = 
fmd_prop_get_int32(hdl, "ip_hc_present_only"); + ip_domain_name = fmd_prop_get_string(hdl, "ip_domain_name"); ip_qlen = fmd_prop_get_int32(hdl, "ip_qlen"); ip_retry = fmd_prop_get_int32(hdl, "ip_retry"); ip_sleep = fmd_prop_get_int64(hdl, "ip_sleep"); diff --git a/usr/src/cmd/fm/modules/common/syslog-msgs/syslog.c b/usr/src/cmd/fm/modules/common/syslog-msgs/syslog.c index 08c421915c..8bacd0783a 100644 --- a/usr/src/cmd/fm/modules/common/syslog-msgs/syslog.c +++ b/usr/src/cmd/fm/modules/common/syslog-msgs/syslog.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/fm/protocol.h> #include <sys/strlog.h> #include <sys/log.h> @@ -244,6 +242,13 @@ syslog_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) int locale_c = 0; size_t len; + /* + * don't log updated and isolated events (for now) + */ + if (strcmp(class, FM_LIST_ISOLATED_CLASS) == 0 || + strcmp(class, FM_LIST_UPDATED_CLASS) == 0) + return; + if (nvlist_lookup_uint8(nvl, FM_VERSION, &version) != 0 || version > FM_SUSPECT_VERSION) { fmd_hdl_debug(hdl, "invalid event version: %u\n", version); @@ -549,6 +554,8 @@ _fmd_init(fmd_hdl_t *hdl) fmd_prop_free_string(hdl, rootdir); fmd_hdl_subscribe(hdl, FM_LIST_SUSPECT_CLASS); + fmd_hdl_subscribe(hdl, FM_LIST_UPDATED_CLASS); + fmd_hdl_subscribe(hdl, FM_LIST_ISOLATED_CLASS); fmd_hdl_subscribe(hdl, FM_LIST_REPAIRED_CLASS); fmd_hdl_subscribe(hdl, FM_LIST_RESOLVED_CLASS); } diff --git a/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c b/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c index 72535443d9..3f0a6eee43 100644 --- a/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c +++ b/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * The ZFS retire agent is responsible for managing hot spares across all pools. * When we see a device fault or a device removal, we try to open the associated @@ -211,6 +209,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, nvlist_t *vdev; char *uuid; int repair_done = 0; + boolean_t retire; /* * If this is a resource notifying us of device removal, then simply @@ -233,6 +232,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, return; } + if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) + return; + if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) is_repair = B_TRUE; else @@ -251,6 +253,10 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, fault_device = B_FALSE; degrade_device = B_FALSE; + if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE, + &retire) == 0 && retire == 0) + continue; + /* * While we subscribe to fault.fs.zfs.*, we only take action * for faults targeting a specific vdev (open failure or SERD diff --git a/usr/src/uts/common/sys/fm/protocol.h b/usr/src/uts/common/sys/fm/protocol.h index cb05dd7439..b79d244692 100644 --- a/usr/src/uts/common/sys/fm/protocol.h +++ b/usr/src/uts/common/sys/fm/protocol.h @@ -129,6 +129,8 @@ extern "C" { #define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0 #define FM_RSRC_XPRT_UUID "uuid" #define FM_RSRC_XPRT_SUBCLASS "subclass" +#define FM_RSRC_XPRT_FAULT_STATUS "fault-status" +#define FM_RSRC_XPRT_FAULT_HAS_ASRU "fault-has-asru" /* * FM ENA Format Macros |