summaryrefslogtreecommitdiff
path: root/usr/src/cmd/fm
diff options
context:
space:
mode:
authorTrang Do <Trang.Do@Sun.COM>2010-05-21 12:50:28 -0700
committerTrang Do <Trang.Do@Sun.COM>2010-05-21 12:50:28 -0700
commitd4ac42a1cd3016618a9ba0330862d410f0058f89 (patch)
treed8f26e29ceaa225494cd417739ccfc4760dd340e /usr/src/cmd/fm
parentc42520eba4ad2249406ee84492401db194e6104e (diff)
downloadillumos-gate-d4ac42a1cd3016618a9ba0330862d410f0058f89.tar.gz
6648030 Default threshold for faulting a DIMM not appropriate for large memory installations
6875817 FMA does not implement DIMM Replacement Policy Rule #5 correctly
Diffstat (limited to 'usr/src/cmd/fm')
-rw-r--r--usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c9
-rw-r--r--usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd.h9
-rw-r--r--usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.c91
-rw-r--r--usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.h36
-rw-r--r--usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_main.c12
-rw-r--r--usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_mem.h9
-rw-r--r--usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_memerr.c360
-rw-r--r--usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_memerr_arch.c301
-rw-r--r--usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_memerr_arch.c79
-rw-r--r--usr/src/cmd/fm/modules/sun4v/generic-mem/gmem.h7
-rw-r--r--usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_dimm.c151
-rw-r--r--usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_dimm.h33
-rw-r--r--usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_main.c11
-rw-r--r--usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_mem.h4
-rw-r--r--usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_memerr.c488
-rw-r--r--usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_page.c5
16 files changed, 1462 insertions, 143 deletions
diff --git a/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c b/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c
index 4c6956bd16..62db5bff72 100644
--- a/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c
+++ b/usr/src/cmd/fm/modules/common/cpumem-retire/cma_main.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <cma.h>
@@ -224,6 +223,8 @@ static const cma_subscriber_t cma_subrs[] = {
NULL },
{ "fault.memory.datapath", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION,
NULL },
+ { "fault.memory.datapath", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION,
+ NULL },
{ "fault.memory.link-c", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION,
NULL },
{ "fault.memory.link-u", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION,
@@ -341,6 +342,10 @@ static const cma_subscriber_t cma_subrs[] = {
NULL },
{ "fault.memory.datapath", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION,
NULL },
+ { "fault.memory.datapath", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION,
+ NULL },
+ { "fault.memory.datapath", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION,
+ NULL },
/*
* The following faults do NOT retire a cpu thread,
diff --git a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd.h b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd.h
index 3997ca9284..f0f99b0f6c 100644
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd.h
+++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _CMD_H
#define _CMD_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <stdarg.h>
#include <fm/fmd_api.h>
#include <sys/param.h>
@@ -249,6 +246,10 @@ typedef struct cmd {
cmd_serd_t cmd_miscregs_serd; /* params for misregs serd */
cmd_serd_t cmd_dcache_serd; /* params for dcache serd */
cmd_serd_t cmd_icache_serd; /* params for icache serd */
+ uint32_t cmd_low_ce_thresh; /* low ce threshold */
+ uint32_t cmd_hi_ce_thresh; /* hi ce threshold */
+ uint32_t cmd_dupce; /* max 5b CEs */
+ uint32_t cmd_nupos; /* min number of equal upos */
#ifdef sun4u
uint16_t cmd_dp_flag; /* datapath error in progress if set */
#endif
diff --git a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.c b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.c
index 338f066da3..e7646bffa2 100644
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.c
+++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -28,6 +27,7 @@
*/
#include <cmd_mem.h>
+#include <limits.h>
#include <cmd_dimm.h>
#include <cmd_bank.h>
#include <cmd.h>
@@ -111,6 +111,7 @@ cmd_dimm_free(fmd_hdl_t *hdl, cmd_dimm_t *dimm, int destroy)
cmd_case_t *cc = &dimm->dimm_case;
int i;
cmd_mq_t *q;
+ tstamp_t *tsp, *next;
#ifdef sun4v
cmd_branch_t *branch;
@@ -134,6 +135,15 @@ cmd_dimm_free(fmd_hdl_t *hdl, cmd_dimm_t *dimm, int destroy)
fmd_hdl_strfree(hdl, q->mq_serdnm);
q->mq_serdnm = NULL;
}
+
+ for (tsp = cmd_list_next(&q->mq_dupce_tstamp);
+ tsp != NULL; tsp = next) {
+ next = cmd_list_next(tsp);
+ cmd_list_delete(&q->mq_dupce_tstamp,
+ &tsp->ts_l);
+ fmd_hdl_free(hdl, tsp, sizeof (tstamp_t));
+ }
+
cmd_list_delete(&dimm->mq_root[i], q);
fmd_hdl_free(hdl, q, sizeof (cmd_mq_t));
}
@@ -235,6 +245,9 @@ cmd_dimm_create(fmd_hdl_t *hdl, nvlist_t *asru)
dimm = fmd_hdl_zalloc(hdl, sizeof (cmd_dimm_t), FMD_SLEEP);
dimm->dimm_nodetype = CMD_NT_DIMM;
dimm->dimm_version = CMD_DIMM_VERSION;
+ dimm->dimm_phys_addr_low = ULLONG_MAX;
+ dimm->dimm_phys_addr_hi = 0;
+ dimm->dimm_syl_error = USHRT_MAX;
cmd_bufname(dimm->dimm_bufname, sizeof (dimm->dimm_bufname), "dimm_%s",
unum);
@@ -288,7 +301,7 @@ cmd_dimm_lookup(fmd_hdl_t *hdl, nvlist_t *asru)
}
static cmd_dimm_t *
-dimm_v0tov1(fmd_hdl_t *hdl, cmd_dimm_0_t *old, size_t oldsz)
+dimm_v0tov2(fmd_hdl_t *hdl, cmd_dimm_0_t *old, size_t oldsz)
{
cmd_dimm_t *new;
@@ -302,13 +315,40 @@ dimm_v0tov1(fmd_hdl_t *hdl, cmd_dimm_0_t *old, size_t oldsz)
new->dimm_version = CMD_DIMM_VERSION;
new->dimm_asru = old->dimm0_asru;
new->dimm_nretired = old->dimm0_nretired;
+ new->dimm_phys_addr_hi = 0;
+ new->dimm_phys_addr_low = ULLONG_MAX;
fmd_hdl_free(hdl, old, oldsz);
return (new);
}
static cmd_dimm_t *
-dimm_wrapv1(fmd_hdl_t *hdl, cmd_dimm_pers_t *pers, size_t psz)
+dimm_v1tov2(fmd_hdl_t *hdl, cmd_dimm_1_t *old, size_t oldsz)
+{ /* migrate a version 1 persistent DIMM buffer to the current cmd_dimm_t */
+
+ cmd_dimm_t *new;
+
+ if (oldsz != sizeof (cmd_dimm_1_t)) { /* buffer must be exactly v1-sized */
+ fmd_hdl_abort(hdl, "size of state doesn't match size of "
+ "version 1 state (%u bytes).\n", sizeof (cmd_dimm_1_t));
+ }
+
+ new = fmd_hdl_zalloc(hdl, sizeof (cmd_dimm_t), FMD_SLEEP);
+
+ new->dimm_header = old->dimm1_header;
+ new->dimm_version = CMD_DIMM_VERSION; /* stamp with the current version */
+ new->dimm_asru = old->dimm1_asru;
+ new->dimm_nretired = old->dimm1_nretired;
+ new->dimm_flags = old->dimm1_flags;
+ new->dimm_phys_addr_hi = 0; /* v1 has no address range; same start */
+ new->dimm_phys_addr_low = ULLONG_MAX; /* values as cmd_dimm_create() */
+
+ fmd_hdl_free(hdl, old, oldsz); /* the old buffer is consumed here */
+ return (new);
+}
+
+static cmd_dimm_t *
+dimm_wrapv2(fmd_hdl_t *hdl, cmd_dimm_pers_t *pers, size_t psz)
{
cmd_dimm_t *dimm;
@@ -346,7 +386,8 @@ cmd_dimm_restore(fmd_hdl_t *hdl, fmd_case_t *cp, cmd_case_ptr_t *ptr)
fmd_case_uuid(hdl, cp));
} else if (dimmsz > CMD_DIMM_MAXSIZE ||
dimmsz < CMD_DIMM_MINSIZE) {
- fmd_hdl_abort(hdl, "dimm buffer referenced by case %s "
+ fmd_hdl_abort(hdl,
+ "dimm buffer referenced by case %s "
"is out of bounds (is %u bytes, max %u, min %u)\n",
fmd_case_uuid(hdl, cp), dimmsz,
CMD_DIMM_MAXSIZE, CMD_DIMM_MINSIZE);
@@ -364,7 +405,11 @@ cmd_dimm_restore(fmd_hdl_t *hdl, fmd_case_t *cp, cmd_case_ptr_t *ptr)
if (CMD_DIMM_VERSIONED(dimm)) {
switch (dimm->dimm_version) {
case CMD_DIMM_VERSION_1:
- dimm = dimm_wrapv1(hdl, (cmd_dimm_pers_t *)dimm,
+ dimm = dimm_v1tov2(hdl, (cmd_dimm_1_t *)dimm,
+ dimmsz);
+ break;
+ case CMD_DIMM_VERSION_2:
+ dimm = dimm_wrapv2(hdl, (cmd_dimm_pers_t *)dimm,
dimmsz);
break;
default:
@@ -374,7 +419,7 @@ cmd_dimm_restore(fmd_hdl_t *hdl, fmd_case_t *cp, cmd_case_ptr_t *ptr)
break;
}
} else {
- dimm = dimm_v0tov1(hdl, (cmd_dimm_0_t *)dimm, dimmsz);
+ dimm = dimm_v0tov2(hdl, (cmd_dimm_0_t *)dimm, dimmsz);
migrated = 1;
}
@@ -453,3 +498,35 @@ cmd_dimm_fini(fmd_hdl_t *hdl)
while ((dimm = cmd_list_next(&cmd.cmd_dimms)) != NULL)
cmd_dimm_free(hdl, dimm, FMD_B_FALSE);
}
+
+
+/*
+ * Record upos as the symbol-in-error on every DIMM of the cmd.cmd_dimms
+ * list for which cmd_same_datapath_dimms(dimm, <that DIMM>) is true.
+ */
+void
+cmd_dimm_save_symbol_error(cmd_dimm_t *dimm, uint16_t upos)
+{
+	cmd_dimm_t *cur = cmd_list_next(&cmd.cmd_dimms);
+
+	while (cur != NULL) {
+		cmd_dimm_t *following = cmd_list_next(cur);
+
+		if (cmd_same_datapath_dimms(dimm, cur))
+			cur->dimm_syl_error = upos;
+		cur = following;
+	}
+}
+
+/*
+ * Return 1 when the unit position decoded from synd equals the
+ * symbol-in-error saved on some DIMM of the cmd.cmd_dimms list that
+ * cmd_same_datapath_dimms() pairs with dimm.  Return 0 otherwise, including
+ * when cmd_synd2upos() reports a negative (undecodable) unit position.
+ */
+int
+cmd_dimm_check_symbol_error(cmd_dimm_t *dimm, uint16_t synd)
+{
+	cmd_dimm_t *cur, *following;
+	int unit = cmd_synd2upos(synd);
+
+	if (unit < 0)
+		return (0);
+
+	cur = cmd_list_next(&cmd.cmd_dimms);
+	while (cur != NULL) {
+		following = cmd_list_next(cur);
+		if (cmd_same_datapath_dimms(dimm, cur) &&
+		    (cur->dimm_syl_error == unit))
+			return (1);
+		cur = following;
+	}
+
+	return (0);
+}
diff --git a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.h b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.h
index c4f28c9d6b..c2803c9366 100644
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.h
+++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.h
@@ -80,7 +80,8 @@ extern "C" {
#define DIMM_MKVERSION(version) ((version) << 4 | 1)
#define CMD_DIMM_VERSION_1 DIMM_MKVERSION(1) /* 17 */
-#define CMD_DIMM_VERSION CMD_DIMM_VERSION_1
+#define CMD_DIMM_VERSION_2 DIMM_MKVERSION(2) /* 33 */
+#define CMD_DIMM_VERSION CMD_DIMM_VERSION_2
#define CMD_DIMM_VERSIONED(dimm) ((dimm)->dimm_version & 1)
@@ -96,12 +97,23 @@ typedef struct cmd_dimm_0 {
cmd_bank_t *dimm0_bank; /* This DIMM's bank (if discovered) */
} cmd_dimm_0_t;
+typedef struct cmd_dimm_1 { /* CMD_DIMM_VERSION_1 persistent DIMM layout */
+ cmd_header_t dimm1_header; /* Nodetype must be CMD_NT_DIMM */
+ uint_t dimm1_version; /* DIMM version */
+ cmd_fmri_t dimm1_asru; /* ASRU for this DIMM */
+ uint_t dimm1_flags; /* CMD_MEM_F_* */
+ uint_t dimm1_nretired; /* # ret'd pages for CEs in DIMM */
+} cmd_dimm_1_t; /* converted to the current layout by dimm_v1tov2() */
+
+
typedef struct cmd_dimm_pers {
cmd_header_t dimmp_header; /* Nodetype must be CMD_NT_DIMM */
uint_t dimmp_version;
cmd_fmri_t dimmp_asru; /* ASRU for this DIMM */
uint_t dimmp_flags; /* CMD_MEM_F_* */
uint_t dimmp_nretired; /* # ret'd pages for CEs in DIMM */
+ uint64_t dimmp_phys_addr_low; /* retired pages low addr */
+ uint64_t dimmp_phys_addr_hi; /* retired pages hi addr */
} cmd_dimm_pers_t;
/*
@@ -118,14 +130,23 @@ typedef struct cmd_mq {
uint16_t mq_unit_position; /* bit for sun4u, nibble for sun4v */
fmd_event_t *mq_ep; /* ereport - for potential fault */
char *mq_serdnm; /* serd eng to retain CE events */
+ uint16_t mq_dupce_count; /* dup CEs */
+ cmd_list_t mq_dupce_tstamp; /* list of dup CEs time stamp */
+ uint32_t mq_cpuid; /* ereport detector */
} cmd_mq_t;
+typedef struct tstamp { /* one duplicate-CE time stamp kept per cmd_mq_t */
+ cmd_list_t ts_l; /* linkage on a cmd_mq_t's mq_dupce_tstamp list */
+ uint64_t tstamp; /* ereport "__tod" value; presumably seconds - confirm */
+} tstamp_t;
+
struct cmd_dimm {
cmd_dimm_pers_t dimm_pers;
cmd_bank_t *dimm_bank; /* This DIMM's bank (if discovered) */
const char *dimm_unum; /* This DIMM's name */
cmd_case_t dimm_case; /* Open CE case against this DIMM */
fmd_stat_t dimm_retstat; /* retirement statistics, this DIMM */
+ uint16_t dimm_syl_error; /* bad r/w symbol-in-error */
cmd_list_t
mq_root[CMD_MAX_CKWDS]; /* per-checkword CEs to correlate */
};
@@ -133,11 +154,16 @@ struct cmd_dimm {
#define CMD_MQ_TIMELIM (72*60*60) /* 72 hours */
#define CMD_MQ_SERDT MAXINT /* Never expected to fire */
#define CMD_MQ_SERDN 2 /* Dup CEs not allowed */
+#define CMD_MQ_512KB 0x80000 /* space between low & hi retired */
+ /* page address */
+#define CMD_PAGE_RATIO 0.0625 /* bad r/w page ratio (1/16) */
#define CMD_DIMM_MAXSIZE \
- MAX(sizeof (cmd_dimm_0_t), sizeof (cmd_dimm_pers_t))
+ MAX(MAX(sizeof (cmd_dimm_0_t), sizeof (cmd_dimm_pers_t)), \
+ MAX(sizeof (cmd_dimm_1_t), sizeof (cmd_dimm_pers_t)))
#define CMD_DIMM_MINSIZE \
- MIN(sizeof (cmd_dimm_0_t), sizeof (cmd_dimm_pers_t))
+ MIN(MIN(sizeof (cmd_dimm_0_t), sizeof (cmd_dimm_pers_t)), \
+ MIN(sizeof (cmd_dimm_1_t), sizeof (cmd_dimm_pers_t)))
#define dimm_header dimm_pers.dimmp_header
#define dimm_nodetype dimm_pers.dimmp_header.hdr_nodetype
@@ -147,6 +173,8 @@ struct cmd_dimm {
#define dimm_asru_nvl dimm_pers.dimmp_asru.fmri_nvl
#define dimm_flags dimm_pers.dimmp_flags
#define dimm_nretired dimm_pers.dimmp_nretired
+#define dimm_phys_addr_hi dimm_pers.dimmp_phys_addr_hi
+#define dimm_phys_addr_low dimm_pers.dimmp_phys_addr_low
extern cmd_dimm_t *cmd_dimm_lookup(fmd_hdl_t *, nvlist_t *);
extern cmd_dimm_t *cmd_dimm_create(fmd_hdl_t *, nvlist_t *);
@@ -171,6 +199,8 @@ extern void cmd_dimm_gc(fmd_hdl_t *);
extern void cmd_dimm_fini(fmd_hdl_t *);
extern void cmd_dimmlist_free(fmd_hdl_t *);
+extern void cmd_dimm_save_symbol_error(cmd_dimm_t *, uint16_t);
+extern int cmd_dimm_check_symbol_error(cmd_dimm_t *, uint16_t);
#ifdef __cplusplus
}
diff --git a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_main.c b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_main.c
index ec56d7b7e0..dd2f5ea9b1 100644
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_main.c
+++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_main.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -567,6 +566,10 @@ static const fmd_prop_t fmd_props[] = {
{ "thresh_abs_sysmem", FMD_TYPE_UINT64, "0" },
{ "thresh_abs_badrw", FMD_TYPE_UINT64, "128" },
{ "max_perm_ce_dimm", FMD_TYPE_UINT32, "128" },
+ { "low_ce_thresh", FMD_TYPE_UINT32, "128" },
+ { "hi_ce_thresh", FMD_TYPE_UINT32, "512" },
+ { "dupce", FMD_TYPE_UINT32, "120"},
+ { "nupos", FMD_TYPE_UINT32, "4"},
#ifdef sun4v
{ "fbr_n", FMD_TYPE_UINT32, "14" },
{ "fbr_t", FMD_TYPE_TIME, "30min"},
@@ -834,6 +837,11 @@ _fmd_init(fmd_hdl_t *hdl)
cmd.cmd_xxcu_ntrw = fmd_prop_get_int32(hdl, "num_xxcu_waiters");
cmd.cmd_xxcu_trw = fmd_hdl_zalloc(hdl, sizeof (cmd_xxcu_trw_t) *
cmd.cmd_xxcu_ntrw, FMD_SLEEP);
+ cmd.cmd_low_ce_thresh = fmd_prop_get_int32(hdl, "low_ce_thresh");
+ cmd.cmd_hi_ce_thresh = fmd_prop_get_int32(hdl, "hi_ce_thresh");
+ cmd.cmd_dupce = fmd_prop_get_int32(hdl, "dupce");
+ cmd.cmd_nupos = fmd_prop_get_int32(hdl, "nupos");
+
#ifdef sun4v
cmd.cmd_delta_ena = fmd_prop_get_int64(hdl, "delta_ena");
#endif
diff --git a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_mem.h b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_mem.h
index 926bf3c6be..3bb5013ffc 100644
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_mem.h
+++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_mem.h
@@ -128,6 +128,15 @@ extern cmd_evdisp_t cmd_mem_synd_check(fmd_hdl_t *, uint64_t, uint8_t,
uint16_t, uint8_t, cmd_cpu_t *);
extern void cmd_dimm_close(fmd_hdl_t *, void *);
extern void cmd_bank_close(fmd_hdl_t *, void *);
+extern int cmd_same_datapath_dimms(cmd_dimm_t *, cmd_dimm_t *);
+extern void cmd_gen_datapath_fault(fmd_hdl_t *, cmd_dimm_t *, cmd_dimm_t *,
+ uint16_t, nvlist_t *);
+extern void cmd_to_hashed_addr(uint64_t *, uint64_t, const char *);
+
+#ifdef sun4u
+extern char *cmd_cpu_getfrustr_by_id(fmd_hdl_t *, uint32_t);
+#endif
+
#ifdef sun4v
extern void cmd_branch_close(fmd_hdl_t *, void *);
extern cmd_evdisp_t cmd_fb(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
diff --git a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_memerr.c b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_memerr.c
index 96d2ce7359..5ef66b5b42 100644
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_memerr.c
+++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_memerr.c
@@ -40,6 +40,7 @@
#include <strings.h>
#include <string.h>
#include <errno.h>
+#include <limits.h>
#include <fm/fmd_api.h>
#include <sys/fm/protocol.h>
#include <sys/async.h>
@@ -85,97 +86,256 @@ cmd_mem_name2type(const char *name, int minorvers)
return (CE_DISP_UNKNOWN);
}
+/*
+ * Check if a dimm has n CEs with the same symbol-in-error: walk every
+ * per-checkword MQ list of the dimm and return 1 as soon as `threshold'
+ * records with unit position `upos' have been counted, 0 if the lists run
+ * out first.
+ */
+static int
+upos_thresh_check(cmd_dimm_t *dimm, uint16_t upos, uint32_t threshold)
+{
+	cmd_mq_t *mq;
+	int ckwd;
+	int matches = 0;
+
+	for (ckwd = 0; ckwd < CMD_MAX_CKWDS; ckwd++) {
+		mq = cmd_list_next(&dimm->mq_root[ckwd]);
+		while (mq != NULL) {
+			if (mq->mq_unit_position == upos &&
+			    ++matches >= threshold)
+				return (1);
+			mq = cmd_list_next(mq);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Compare the retired-page counts of two DIMMs (each DIMM's own count plus
+ * its bank's count when a bank is known).  Return 1 when the smaller count
+ * exceeds CMD_PAGE_RATIO (1/16) of the larger one; return 0 otherwise,
+ * including when both counts are equal.
+ */
+static int
+check_bad_rw_retired_pages(fmd_hdl_t *hdl, cmd_dimm_t *d1, cmd_dimm_t *d2)
+{
+	uint_t n1 = d1->dimm_nretired;
+	uint_t n2 = d2->dimm_nretired;
+	uint_t fewer, more;
+	double limit;
+
+	if (d1->dimm_bank != NULL)
+		n1 += d1->dimm_bank->bank_nretired;
+	if (d2->dimm_bank != NULL)
+		n2 += d2->dimm_bank->bank_nretired;
+
+	if (n1 == n2)
+		return (0);
+
+	fewer = (n1 < n2) ? n1 : n2;
+	more = (n1 < n2) ? n2 : n1;
+	limit = more * CMD_PAGE_RATIO;
+
+	if (fewer <= limit)
+		return (0);
+
+	fmd_hdl_debug(hdl, "sret=%d lret=%d ratio=%.3f\n",
+	    fewer, more, limit);
+	return (1);
+}
+
+/*
+ * Bad reader/writer check between two DIMMs.  For each unit position found
+ * on d1's MQ lists, the check succeeds when
+ *  - d1 and d2 each hold at least cmd.cmd_nupos CEs at that unit position
+ *  - the smaller number of retired pages > 1/16 of the larger number
+ * On success the unit position is stored through rupos and 1 is returned;
+ * otherwise 0 is returned and *rupos is left untouched.
+ */
+static int
+check_bad_rw_between_dimms(fmd_hdl_t *hdl, cmd_dimm_t *d1, cmd_dimm_t *d2,
+    uint16_t *rupos)
+{
+	cmd_mq_t *mq;
+	uint16_t candidate;
+	int ckwd;
+
+	for (ckwd = 0; ckwd < CMD_MAX_CKWDS; ckwd++) {
+		for (mq = cmd_list_next(&d1->mq_root[ckwd]); mq != NULL;
+		    mq = cmd_list_next(mq)) {
+			candidate = mq->mq_unit_position;
+			if (!upos_thresh_check(d1, candidate, cmd.cmd_nupos))
+				continue;
+			if (!upos_thresh_check(d2, candidate, cmd.cmd_nupos))
+				continue;
+			if (!check_bad_rw_retired_pages(hdl, d1, d2))
+				continue;
+			*rupos = candidate;
+			return (1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Look for the first other DIMM on the cmd.cmd_dimms list that shares a
+ * datapath with ce_dimm and passes check_bad_rw_between_dimms().  When one
+ * is found, generate the datapath fault for the pair, save the
+ * symbol-in-error and stop searching.
+ */
+static void
+bad_reader_writer_check(fmd_hdl_t *hdl, cmd_dimm_t *ce_dimm, nvlist_t *det)
+{
+	cmd_dimm_t *peer = cmd_list_next(&cmd.cmd_dimms);
+	cmd_dimm_t *following;
+	uint16_t upos;
+
+	while (peer != NULL) {
+		following = cmd_list_next(peer);
+		if (peer != ce_dimm &&
+		    cmd_same_datapath_dimms(ce_dimm, peer) &&
+		    check_bad_rw_between_dimms(hdl, ce_dimm, peer, &upos)) {
+			cmd_gen_datapath_fault(hdl, ce_dimm, peer, upos, det);
+			cmd_dimm_save_symbol_error(ce_dimm, upos);
+			fmd_hdl_debug(hdl,
+			    "check_bad_rw_dimms succeeded: %s %s",
+			    ce_dimm->dimm_unum, peer->dimm_unum);
+			return;
+		}
+		peer = following;
+	}
+}
+
+/*
+ * rule 5a checking. The check succeeds if
+ * - nretired >= cmd.cmd_hi_ce_thresh ("hi_ce_thresh" property, default 512)
+ * - nretired >= cmd.cmd_low_ce_thresh ("low_ce_thresh", default 128) and
+ *   (addr_hi - addr_low) / (nretired - 1) > 512KB
+ * nretired is the DIMM's retired-page count plus its bank's, when the bank
+ * is known.  The average spacing is only computed for nretired > 1 so that
+ * a low threshold tuned down to 0 or 1 cannot cause a division by zero.
+ * On success the DIMM is marked CMD_MEM_F_FAULTING and a new case is solved
+ * with a fault.memory.dimm-page-retires-excessive suspect.
+ */
static void
ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
{
nvlist_t *flt;
fmd_case_t *cp;
- cmd_dimm_t *d;
- nvlist_t *dflt;
- uint_t nret, dret;
- int foundrw;
+ uint_t nret;
+ uint64_t delta_addr = 0;
- if (dimm->dimm_flags & CMD_MEM_F_FAULTING) {
+ if (dimm->dimm_flags & CMD_MEM_F_FAULTING)
/* We've already complained about this DIMM */
return;
- }
nret = dimm->dimm_nretired;
if (dimm->dimm_bank != NULL)
nret += dimm->dimm_bank->bank_nretired;
- if (!cmd_mem_thresh_check(hdl, nret))
- return; /* Don't warn until over specified % of system memory */
+ if (nret < cmd.cmd_low_ce_thresh)
+ return;
- /* Look for CEs on DIMMs in other banks */
- for (foundrw = 0, dret = 0, d = cmd_list_next(&cmd.cmd_dimms);
- d != NULL; d = cmd_list_next(d)) {
- if (d == dimm) {
- dret += d->dimm_nretired;
- continue;
- }
+ if (nret > 1 &&
+ dimm->dimm_phys_addr_hi >= dimm->dimm_phys_addr_low)
+ delta_addr =
+ (dimm->dimm_phys_addr_hi - dimm->dimm_phys_addr_low) /
+ (nret - 1);
- if (dimm->dimm_bank != NULL && d->dimm_bank == dimm->dimm_bank)
- continue;
+ if (nret >= cmd.cmd_hi_ce_thresh || delta_addr > CMD_MQ_512KB) {
- if (d->dimm_nretired > cmd.cmd_thresh_abs_badrw) {
- foundrw = 1;
- dret += d->dimm_nretired;
- }
- }
+ dimm->dimm_flags |= CMD_MEM_F_FAULTING;
+ cmd_dimm_dirty(hdl, dimm);
- if (foundrw) {
- /*
- * Found a DIMM in another bank with a significant number of
- * retirements. Something strange is going on, perhaps in the
- * datapath or with a bad CPU. A real person will need to
- * figure out what's really happening. Emit a fault designed
- * to trigger just that.
- */
cp = fmd_case_open(hdl, NULL);
- for (d = cmd_list_next(&cmd.cmd_dimms); d != NULL;
- d = cmd_list_next(d)) {
+ flt = cmd_dimm_create_fault(hdl, dimm,
+ "fault.memory.dimm-page-retires-excessive", CMD_FLTMAXCONF);
+ fmd_case_add_suspect(hdl, cp, flt);
+ fmd_case_solve(hdl, cp);
+ fmd_hdl_debug(hdl, "ce_thresh_check succeeded nretired %d\n",
+ nret);
- if (d != dimm && d->dimm_bank != NULL &&
- d->dimm_bank == dimm->dimm_bank)
- continue;
+ }
+}
- if (d->dimm_nretired <= cmd.cmd_thresh_abs_badrw)
- continue;
+/*
+ * rule 5b checking. The check succeeds if
+ * at least cmd.cmd_dupce (the "dupce" property, 120 by default)
+ * non-intermittent CEs are reported against one symbol
+ * position of one afar in 72 hours.
+ *
+ * The count examined is mq_dupce_count of each MQ record on the DIMM's
+ * per-checkword lists. On the first record that reaches the limit the
+ * DIMM is marked CMD_MEM_F_FAULTING, a new case is solved with a
+ * fault.memory.dimm-page-retires-excessive suspect, and the search stops.
+ */
+static void
+mq_5b_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
+{
+ nvlist_t *flt;
+ fmd_case_t *cp;
+ cmd_mq_t *ip, *next;
+ int cw;
- if (!(d->dimm_flags & CMD_MEM_F_FAULTING)) {
- d->dimm_flags |= CMD_MEM_F_FAULTING;
- cmd_dimm_dirty(hdl, d);
+ for (cw = 0; cw < CMD_MAX_CKWDS; cw++) {
+ for (ip = cmd_list_next(&dimm->mq_root[cw]);
+ ip != NULL; ip = next) {
+ next = cmd_list_next(ip);
+ if (ip->mq_dupce_count >= cmd.cmd_dupce) {
+ cp = fmd_case_open(hdl, NULL);
+ flt = cmd_dimm_create_fault(hdl, dimm,
+ "fault.memory.dimm-page-retires-excessive",
+ CMD_FLTMAXCONF);
+ dimm->dimm_flags |= CMD_MEM_F_FAULTING;
+ cmd_dimm_dirty(hdl, dimm);
+ fmd_case_add_suspect(hdl, cp, flt);
+ fmd_case_solve(hdl, cp);
+ fmd_hdl_debug(hdl,
+ "mq_5b_check succeeded: duplicate CE=%d",
+ ip->mq_dupce_count);
+ return;
}
-
- flt = cmd_dimm_create_fault(hdl, d,
- "fault.memory.datapath",
- d->dimm_nretired * 100 / dret);
- fmd_case_add_suspect(hdl, cp, flt);
}
+ }
+}
- fmd_case_solve(hdl, cp);
- return;
+/*
+ * Delete the expired duplicate CE time stamps: every entry on the record's
+ * mq_dupce_tstamp list older than CMD_MQ_TIMELIM before `now' is unlinked
+ * and freed, and mq_dupce_count is decremented once per entry removed.
+ */
+void
+mq_prune_dup(fmd_hdl_t *hdl, cmd_mq_t *ip, uint64_t now)
+{
+	uint64_t cutoff = now - CMD_MQ_TIMELIM;
+	tstamp_t *stamp = cmd_list_next(&ip->mq_dupce_tstamp);
+	tstamp_t *following;
+
+	while (stamp != NULL) {
+		following = cmd_list_next(stamp);
+		if (stamp->tstamp < cutoff) {
+			cmd_list_delete(&ip->mq_dupce_tstamp, &stamp->ts_l);
+			fmd_hdl_free(hdl, stamp, sizeof (tstamp_t));
+			ip->mq_dupce_count--;
+		}
+		stamp = following;
+	}
+}
- dimm->dimm_flags |= CMD_MEM_F_FAULTING;
- cmd_dimm_dirty(hdl, dimm);
+void
+mq_update(fmd_hdl_t *hdl, fmd_event_t *ep, cmd_mq_t *ip, uint64_t now,
+ uint32_t cpuid)
+{ /* refresh an existing MQ record with a newly seen duplicate CE */
+ tstamp_t *tsp;
- cp = fmd_case_open(hdl, NULL);
- dflt = cmd_dimm_create_fault(hdl, dimm,
- "fault.memory.dimm-page-retires-excessive",
- CMD_FLTMAXCONF);
- fmd_case_add_suspect(hdl, cp, dflt);
- fmd_case_solve(hdl, cp);
+ ip->mq_tstamp = now;
+ ip->mq_cpuid = cpuid;
+ ip->mq_ep = ep; /* the record now refers to the newest ereport */
+
+ if (fmd_serd_exists(hdl, ip->mq_serdnm))
+ fmd_serd_destroy(hdl, ip->mq_serdnm);
+ fmd_serd_create(hdl, ip->mq_serdnm, CMD_MQ_SERDN, CMD_MQ_SERDT);
+ (void) fmd_serd_record(hdl, ip->mq_serdnm, ep); /* fresh engine, new ep only */
+
+ tsp = fmd_hdl_zalloc(hdl, sizeof (tstamp_t), FMD_SLEEP);
+ tsp->tstamp = now;
+ cmd_list_append(&ip->mq_dupce_tstamp, tsp); /* one stamp per duplicate CE */
+ ip->mq_dupce_count++; /* kept equal to the number of stamps on the list */
}
/* Create a fresh index block for MQSC CE correlation. */
-
cmd_mq_t *
mq_create(fmd_hdl_t *hdl, fmd_event_t *ep,
- uint64_t afar, uint16_t upos, uint64_t now)
+ uint64_t afar, uint16_t upos, uint64_t now, uint32_t cpuid)
{
cmd_mq_t *cp;
+ tstamp_t *tsp;
uint16_t ckwd = (afar & 0x30) >> 4;
cp = fmd_hdl_zalloc(hdl, sizeof (cmd_mq_t), FMD_SLEEP);
@@ -187,6 +347,12 @@ mq_create(fmd_hdl_t *hdl, fmd_event_t *ep,
cp->mq_serdnm =
cmd_mq_serdnm_create(hdl, "mq", afar, ckwd, upos);
+ tsp = fmd_hdl_zalloc(hdl, sizeof (tstamp_t), FMD_SLEEP);
+ tsp->tstamp = now;
+ cmd_list_append(&cp->mq_dupce_tstamp, tsp);
+ cp->mq_dupce_count = 1;
+ cp->mq_cpuid = cpuid;
+
/*
* Create SERD to keep this event from being removed
* by fmd which may not know there is an event pointer
@@ -210,14 +376,22 @@ cmd_mq_t *
mq_destroy(fmd_hdl_t *hdl, cmd_list_t *lp, cmd_mq_t *ip)
{
cmd_mq_t *jp = cmd_list_next(ip);
+ tstamp_t *tsp, *next;
if (ip->mq_serdnm != NULL) {
- if (fmd_serd_exists(hdl, ip->mq_serdnm)) {
+ if (fmd_serd_exists(hdl, ip->mq_serdnm))
fmd_serd_destroy(hdl, ip->mq_serdnm);
- }
fmd_hdl_strfree(hdl, ip->mq_serdnm);
ip->mq_serdnm = NULL;
}
+
+ for (tsp = cmd_list_next(&ip->mq_dupce_tstamp); tsp != NULL;
+ tsp = next) {
+ next = cmd_list_next(tsp);
+ cmd_list_delete(&ip->mq_dupce_tstamp, &tsp->ts_l);
+ fmd_hdl_free(hdl, tsp, sizeof (tstamp_t));
+ }
+
cmd_list_delete(lp, &ip->mq_l);
fmd_hdl_free(hdl, ip, sizeof (cmd_mq_t));
@@ -232,7 +406,7 @@ mq_destroy(fmd_hdl_t *hdl, cmd_list_t *lp, cmd_mq_t *ip)
void
mq_add(fmd_hdl_t *hdl, cmd_dimm_t *dimm, fmd_event_t *ep,
- uint64_t afar, uint16_t synd, uint64_t now)
+ uint64_t afar, uint16_t synd, uint64_t now, uint32_t cpuid)
{
cmd_mq_t *ip, *jp;
int cw, unit_position;
@@ -249,16 +423,16 @@ mq_add(fmd_hdl_t *hdl, cmd_dimm_t *dimm, fmd_event_t *ep,
ip->mq_phys_addr == afar) {
/*
* Found a duplicate cw, unit_position, and afar.
- * Delete this node, to be superseded by the new
- * node added below.
+ * update the mq_t with the new information
*/
- ip = mq_destroy(hdl, &dimm->mq_root[cw], ip);
+ mq_update(hdl, ep, ip, now, cpuid);
+ return;
} else {
ip = cmd_list_next(ip);
}
}
- jp = mq_create(hdl, ep, afar, unit_position, now);
+ jp = mq_create(hdl, ep, afar, unit_position, now, cpuid);
if (ip == NULL)
cmd_list_append(&dimm->mq_root[cw], jp);
else
@@ -286,6 +460,7 @@ mq_prune(fmd_hdl_t *hdl, cmd_dimm_t *dimm, uint64_t now)
ip = mq_destroy(hdl, &dimm->mq_root[cw], ip);
} else {
/* tstamp < now - ce_t */
+ mq_prune_dup(hdl, ip, now);
ip = cmd_list_next(ip);
}
} /* per checkword */
@@ -402,6 +577,12 @@ cmd_ce_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
cmd_dimm_t *dimm;
cmd_page_t *page;
const char *uuid;
+ uint64_t *now;
+ uint_t nelem;
+ uint32_t cpuid;
+ nvlist_t *det;
+ uint64_t addr;
+ int skip_error = 0;
if (afar_status != AFLT_STAT_VALID ||
synd_status != AFLT_STAT_VALID)
@@ -433,19 +614,36 @@ cmd_ce_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
&dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid);
}
+ if (nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, &det) != 0)
+ return (CMD_EVD_BAD);
+
/*
* Add to MQSC correlation lists all CEs which pass validity
* checks above.
+ * Add mq_t when there is no bad r/w or dimm fault.
+ * Always prune the expired mq_t.
*/
- if (!(dimm->dimm_flags & CMD_MEM_F_FAULTING)) {
- uint64_t *now;
- uint_t nelem;
- if (nvlist_lookup_uint64_array(nvl,
- "__tod", &now, &nelem) == 0) {
-
- mq_add(hdl, dimm, ep, afar, synd, *now);
- mq_prune(hdl, dimm, *now);
+ skip_error = cmd_dimm_check_symbol_error(dimm, synd);
+
+ if (nvlist_lookup_uint64_array(nvl,
+ "__tod", &now, &nelem) == 0) {
+
+ if (!skip_error || !(dimm->dimm_flags & CMD_MEM_F_FAULTING)) {
+ if (nvlist_lookup_uint32(det, FM_FMRI_CPU_ID, &cpuid)
+ != 0)
+ cpuid = ULONG_MAX;
+
+ mq_add(hdl, dimm, ep, afar, synd, *now, cpuid);
+ }
+
+ mq_prune(hdl, dimm, *now);
+
+ if (!skip_error)
+ bad_reader_writer_check(hdl, dimm, det);
+
+ if (!(dimm->dimm_flags & CMD_MEM_F_FAULTING)) {
mq_check(hdl, dimm);
+ mq_5b_check(hdl, dimm);
}
}
@@ -495,6 +693,9 @@ cmd_ce_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
return (CMD_EVD_BAD);
}
+ if (cmd_dimm_check_symbol_error(dimm, synd))
+ return (CMD_EVD_REDUND);
+
if (page == NULL)
page = cmd_page_create(hdl, asru, afar);
@@ -534,6 +735,21 @@ cmd_ce_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
break; /* to retire */
}
+ if (page->page_flags & CMD_MEM_F_FAULTING ||
+ fmd_nvl_fmri_unusable(hdl, page->page_asru_nvl))
+ return (CMD_EVD_OK);
+
+ /*
+ * convert a unhashed address to hashed address
+ */
+ cmd_to_hashed_addr(&addr, afar, class);
+
+ if (afar > dimm->dimm_phys_addr_hi)
+ dimm->dimm_phys_addr_hi = addr;
+
+ if (afar < dimm->dimm_phys_addr_low)
+ dimm->dimm_phys_addr_low = addr;
+
dimm->dimm_nretired++;
dimm->dimm_retstat.fmds_value.ui64++;
cmd_dimm_dirty(hdl, dimm);
diff --git a/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_memerr_arch.c b/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_memerr_arch.c
index c21b85a638..431b7459c9 100644
--- a/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_memerr_arch.c
+++ b/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_memerr_arch.c
@@ -37,6 +37,7 @@
#include <strings.h>
#include <string.h>
#include <errno.h>
+#include <limits.h>
#include <unistd.h>
#include <fm/fmd_api.h>
#include <sys/fm/protocol.h>
@@ -628,3 +629,303 @@ cmd_synd2upos(uint16_t syndrome) {
}
const char *fmd_fmri_get_platform();
+
+#define DP_MAX 25 /* size of the frustr buffer in cmd_gen_datapath_fault() */
+
+const char *slotname[] = { /* FRU names by board id, platform ids 3 and 4 */
+ "Slot A", "Slot B", "Slot C", "Slot D"};
+
+typedef struct fault_info { /* one datapath suspect and its weight */
+ uint32_t id; /* board id, or cpu id when faulting cpus directly */
+ int count; /* number of collected detectors that map to id */
+} fault_info_t;
+
+struct plat2id_map { /* platform name -> platform id, see cmd_get_platform() */
+ char *platnm;
+ int id; /* selects the cpuid-to-board decoding in cmd_get_boardid() */
+} id_plat[] = {
+ {"SUNW,Sun-Fire-15000", 1},
+ {"SUNW,Sun-Fire", 2},
+ {"SUNW,Netra-T12", 2},
+ {"SUNW,Sun-Fire-480R", 3},
+ {"SUNW,Sun-Fire-V490", 3},
+ {"SUNW,Sun-Fire-V440", 3},
+ {"SUNW,Sun-Fire-V445", 3},
+ {"SUNW,Netra-440", 3},
+ {"SUNW,Sun-Fire-880", 4},
+ {"SUNW,Sun-Fire-V890", 4},
+ {NULL, 0} /* terminator: lookups stop at the NULL platnm */
+};
+
+/*ARGSUSED*/
+void
+cmd_to_hashed_addr(uint64_t *addr, uint64_t afar, const char *class)
+{
+ *addr = afar; /* sun4u variant: afar is stored unchanged, class is unused */
+}
+
+/*ARGSUSED*/
+int
+cmd_same_datapath_dimms(cmd_dimm_t *d1, cmd_dimm_t *d2)
+{
+ return (1); /* sun4u variant: any two DIMMs count as sharing a datapath */
+}
+
+/*
+ * Look up the name returned by fmd_fmri_get_platform() in id_plat[] and
+ * return the matching platform id, or -1 when the platform is not listed.
+ */
+static int
+cmd_get_platform()
+{
+	const char *name = fmd_fmri_get_platform();
+	struct plat2id_map *entry;
+
+	for (entry = id_plat; entry->platnm != NULL; entry++) {
+		if (strcmp(name, entry->platnm) == 0)
+			return (entry->id);
+	}
+	return (-1);
+}
+
+/*
+ * Derive a board id from a cpu id using the decoding that belongs to the
+ * platform id reported by cmd_get_platform().  Platforms without a known
+ * decoding yield 5.
+ */
+static int
+cmd_get_boardid(uint32_t cpuid)
+{
+	uint32_t low;
+
+	switch (cmd_get_platform()) {
+	case 1:
+		return ((cpuid >> 5) & 0x1f);
+	case 2:
+		return ((cpuid & 0x1f) / 4);
+	case 3:
+		/* even cpu ids map to board 0, odd ones to board 1 */
+		return ((cpuid & 0x07) % 2);
+	case 4:
+		low = cpuid & 0x07;
+		if ((low % 2) == 0)
+			return ((low < 4) ? 0 : 2);
+		return ((low < 5) ? 1 : 3);
+	default:
+		return (5);
+	}
+}
+
+/*
+ * Copy the detector cpu ids of the MQ records on `dimm' whose unit position
+ * is `upos' into cpuid[], starting at index `next' and never writing at or
+ * beyond index `limit'.  Returns the index following the last entry written.
+ */
+static int
+cmd_collect_cpuids(cmd_dimm_t *dimm, uint16_t upos, uint32_t *cpuid,
+    int next, int limit)
+{
+	cmd_mq_t *ip;
+	int i;
+
+	for (i = 0; i < CMD_MAX_CKWDS && next < limit; i++) {
+		for (ip = cmd_list_next(&dimm->mq_root[i]);
+		    ip != NULL && next < limit; ip = cmd_list_next(ip)) {
+			if (upos == ip->mq_unit_position)
+				cpuid[next++] = ip->mq_cpuid;
+		}
+	}
+	return (next);
+}
+
+/*
+ * Build the list of datapath suspects for a bad reader/writer diagnosis.
+ *
+ * Up to cmd.cmd_nupos detector cpu ids are taken from d1's MQ records at
+ * unit position `upos', and the remaining slots (2 * cmd.cmd_nupos in
+ * total) from d2's.  Each distinct id (the board id from cmd_get_boardid()
+ * when `cpu' is 0, else the cpu id itself) gets one fault_info_t in
+ * fault_list[], whose count is the number of collected detectors mapping to
+ * that id.  fault_list must have room for 2 * cmd.cmd_nupos pointers; the
+ * caller frees the entries.
+ *
+ * Detectors recorded without a cpu id carry ULONG_MAX truncated to 32 bits.
+ * The sentinel is compared in that width, since a uint32_t can never equal
+ * ULONG_MAX under LP64.  Slots left unfilled are set to the sentinel so no
+ * uninitialized memory is read.
+ */
+static void
+cmd_get_faulted_comp(fmd_hdl_t *hdl, cmd_dimm_t *d1, cmd_dimm_t *d2,
+    uint16_t upos, fault_info_t **fault_list, int cpu)
+{
+	const uint32_t nocpu = (uint32_t)ULONG_MAX;
+	uint32_t *cpuid;
+	uint32_t id, idj;
+	int i, j, k;
+	int max_rpt = 2 * cmd.cmd_nupos;
+
+	if (max_rpt <= 0)
+		return;
+
+	cpuid = fmd_hdl_alloc(hdl, max_rpt * sizeof (uint32_t), FMD_SLEEP);
+
+	if (cpuid == NULL)
+		return;
+
+	for (i = 0; i < max_rpt; i++)
+		cpuid[i] = nocpu;
+
+	j = cmd_collect_cpuids(d1, upos, cpuid, 0, (int)cmd.cmd_nupos);
+	(void) cmd_collect_cpuids(d2, upos, cpuid, j, max_rpt);
+
+	for (i = 0, k = 0; i < max_rpt; i++) {
+		if (cpuid[i] == nocpu)
+			continue;
+		id = (cpu == 0) ? cmd_get_boardid(cpuid[i]) : cpuid[i];
+		fault_list[k] = fmd_hdl_alloc(hdl,
+		    sizeof (fault_info_t), FMD_SLEEP);
+		if (fault_list[k] == NULL)
+			break;
+		fault_list[k]->count = 1;
+		fault_list[k]->id = id;
+		/* fold later detectors with the same id into this entry */
+		for (j = i + 1; j < max_rpt; j++) {
+			if (cpuid[j] == nocpu)
+				continue;
+			idj = (cpu == 0) ? cmd_get_boardid(cpuid[j]) : cpuid[j];
+			if (id == idj) {
+				fault_list[k]->count++;
+				cpuid[j] = nocpu;
+			}
+		}
+		k++;
+	}
+
+	fmd_hdl_free(hdl, cpuid, max_rpt * sizeof (uint32_t));
+}
+
+/*
+ * Build an hc-scheme FRU FMRI with a single legacy-hc element whose id is
+ * frustr.  Returns the new nvlist, or NULL when frustr is NULL or any
+ * nvlist operation fails.
+ */
+/*ARGSUSED*/
+static nvlist_t *
+cmd_board_mkfru(fmd_hdl_t *hdl, char *frustr)
+{
+	nvlist_t *hcel = NULL;
+	nvlist_t *fru = NULL;
+	int err;
+
+	if (frustr == NULL)
+		return (NULL);
+
+	if (nvlist_alloc(&hcel, NV_UNIQUE_NAME, 0) != 0)
+		return (NULL);
+
+	err = nvlist_add_string(hcel, FM_FMRI_HC_NAME,
+	    FM_FMRI_LEGACY_HC);
+	err |= nvlist_add_string(hcel, FM_FMRI_HC_ID, frustr);
+	if (err != 0)
+		goto fail_hcel;
+
+	if (nvlist_alloc(&fru, NV_UNIQUE_NAME, 0) != 0)
+		goto fail_hcel;
+
+	err = nvlist_add_uint8(fru, FM_VERSION, FM_HC_SCHEME_VERSION);
+	err |= nvlist_add_string(fru, FM_FMRI_SCHEME,
+	    FM_FMRI_SCHEME_HC);
+	err |= nvlist_add_string(fru, FM_FMRI_HC_ROOT, "");
+	err |= nvlist_add_uint32(fru, FM_FMRI_HC_LIST_SZ, 1);
+	err |= nvlist_add_nvlist_array(fru, FM_FMRI_HC_LIST, &hcel, 1);
+	if (err != 0) {
+		nvlist_free(fru);
+		goto fail_hcel;
+	}
+
+	/* the element was copied into fru's array; drop our copy */
+	nvlist_free(hcel);
+	return (fru);
+
+fail_hcel:
+	nvlist_free(hcel);
+	return (NULL);
+}
+
+/*
+ * Starcat, Serengeti, V4xx, and V8xx: fault the system boards of
+ * the detectors in proportion to the number of ereports out of
+ * 2 * cmd.cmd_nupos (8 by default).
+ * Other systems: fault the detectors in proportion to the number of
+ * ereports out of 2 * cmd.cmd_nupos.
+ *
+ * A new case is opened, one fault.memory.datapath suspect is added per
+ * entry produced by cmd_get_faulted_comp(), and the case is solved.
+ * If a FRU cannot be built for an entry, no further suspects are added.
+ */
+void
+cmd_gen_datapath_fault(fmd_hdl_t *hdl, cmd_dimm_t *d1, cmd_dimm_t *d2,
+    uint16_t upos, nvlist_t *det)
+{
+	char frustr[DP_MAX];
+	fmd_case_t *cp;
+	int i, ratio, type, fault_cpu, max_rpt;
+	uint32_t id;
+	uint8_t cpumask;
+	char *cpustr = NULL;
+	fault_info_t **fault_list = NULL;
+	nvlist_t *fru = NULL, *asru = NULL, *flt = NULL;
+
+	max_rpt = cmd.cmd_nupos * 2;
+	fault_list = fmd_hdl_alloc(hdl,
+	    max_rpt * sizeof (fault_info_t *), FMD_SLEEP);
+
+	if (fault_list == NULL)
+		return;
+
+	for (i = 0; i < max_rpt; i++)
+		fault_list[i] = NULL;
+
+	type = cmd_get_platform();
+
+	/* platforms missing from the table fault cpus rather than boards */
+	fault_cpu = (type == -1) ? 1 : 0;
+
+	cmd_get_faulted_comp(hdl, d1, d2, upos, fault_list, fault_cpu);
+
+	cp = fmd_case_open(hdl, NULL);
+
+	for (i = 0; i < max_rpt; i++) {
+		if (fault_list[i] == NULL)
+			continue;
+		id = fault_list[i]->id;
+
+		/* never carry a freed asru or cpustr into this iteration */
+		asru = NULL;
+		cpustr = NULL;
+
+		switch (type) {
+		case 1:
+			(void) snprintf(frustr, DP_MAX, "EX%d", id);
+			break;
+		case 2:
+			(void) snprintf(frustr, DP_MAX, "/N0/SB%d", id);
+			break;
+		case 3:
+		case 4:
+			/* skip ids that have no slot name */
+			if (id >= sizeof (slotname) / sizeof (slotname[0]))
+				continue;
+			/* the name is data, not a format string */
+			(void) snprintf(frustr, DP_MAX, "%s", slotname[id]);
+			break;
+		default:
+			cpustr = cmd_cpu_getfrustr_by_id(hdl, id);
+			if (nvlist_lookup_uint8(det, FM_FMRI_CPU_MASK, &cpumask)
+			    == 0) {
+				asru = cmd_cpu_fmri_create(id, cpumask);
+				(void) fmd_nvl_fmri_expand(hdl, asru);
+			}
+			break;
+		}
+
+		ratio = (fault_list[i]->count * 100) / (cmd.cmd_nupos * 2);
+
+		if (fault_cpu) {
+			fru = cmd_cpu_mkfru(hdl, cpustr, NULL, NULL);
+			fmd_hdl_strfree(hdl, cpustr);
+			if (fru == NULL) {
+				nvlist_free(asru);
+				break;
+			}
+			flt = cmd_nvl_create_fault(hdl, "fault.memory.datapath",
+			    ratio, asru, fru, asru);
+			nvlist_free(asru);
+		} else {
+			fru = cmd_board_mkfru(hdl, frustr);
+			if (fru == NULL)
+				break;
+			flt = cmd_nvl_create_fault(hdl, "fault.memory.datapath",
+			    ratio, fru, fru, fru);
+		}
+
+		fmd_case_add_suspect(hdl, cp, flt);
+
+		/* free up memory */
+		nvlist_free(fru);
+	}
+
+	fmd_case_solve(hdl, cp);
+
+	for (i = 0; i < max_rpt; i++) {
+		if (fault_list[i] != NULL)
+			fmd_hdl_free(hdl, fault_list[i], sizeof (fault_info_t));
+	}
+
+	fmd_hdl_free(hdl, fault_list, sizeof (fault_info_t *) * max_rpt);
+}
diff --git a/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_memerr_arch.c b/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_memerr_arch.c
index ad255bbe8b..c33416f1c1 100644
--- a/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_memerr_arch.c
+++ b/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_memerr_arch.c
@@ -55,6 +55,11 @@
#define VF_TS3_FCR 0x000000000000FFFFULL
#define VF_L2ESYR_C2C 0x8000000000000000ULL
+#define OFFBIT 0xFFFFFFFFFFFC07FFULL
+#define BIT28_32 0x00000001F0000000ULL
+#define BIT13_17 0x000000000003E000ULL
+#define BIT18_19 0x00000000000C0000ULL
+#define BIT11_12 0x0000000000001800ULL
#define UTS2_CPUS_PER_CHIP 64
#define FBR_ERROR ".fbr"
#define DSU_ERROR ".dsu"
@@ -679,3 +684,77 @@ cmd_mem2hc(fmd_hdl_t *hdl, nvlist_t *mem_fmri) {
return (cmd_find_dimm_by_sn(hdl, FM_FMRI_SCHEME_HC, *snp));
}
+
+/*
+ * Convert an unhashed physical address into its hashed form:
+ *	PA[17:11] = (PA[32:28] xor PA[17:13]) :: (PA[19:18] xor PA[12:11])
+ * An ereport class containing "ultraSPARC-T1" gets the address back
+ * unchanged.
+ */
+void
+cmd_to_hashed_addr(uint64_t *addr, uint64_t afar, const char *class)
+{
+	uint64_t upper, lower;
+
+	if (strstr(class, "ultraSPARC-T1") != NULL) {
+		*addr = afar;
+		return;
+	}
+
+	/* PA[32:28] moved down onto PA[17:13] and xor-ed with it */
+	upper = ((afar & BIT28_32) >> 15) ^ (afar & BIT13_17);
+	/* PA[19:18] moved down onto PA[12:11] and xor-ed with it */
+	lower = ((afar & BIT18_19) >> 7) ^ (afar & BIT11_12);
+
+	/* OFFBIT clears PA[17:11] so the hashed bits can be merged in */
+	*addr = (afar & OFFBIT) | upper | lower;
+}
+
+/*
+ * Return 1 when both DIMM unums contain "CMP" and the four characters
+ * starting at "CMP" are the same in both; return 0 otherwise.
+ */
+int
+cmd_same_datapath_dimms(cmd_dimm_t *d1, cmd_dimm_t *d2)
+{
+	char *cmp1 = strstr(d1->dimm_unum, "CMP");
+	char *cmp2 = strstr(d2->dimm_unum, "CMP");
+
+	if (cmp1 == NULL || cmp2 == NULL)
+		return (0);
+
+	return (strncmp(cmp1, cmp2, 4) == 0 ? 1 : 0);
+}
+
+/*
+ * Fault the FRU of the common CMP.
+ *
+ * The FRU name is the part of d1's unum that precedes "CMP".  Nothing is
+ * done when d1's unum has no "CMP" or no FRU can be built from that prefix.
+ * Otherwise a new case is opened and solved with a single
+ * fault.memory.datapath suspect.
+ */
+/*ARGSUSED*/
+void
+cmd_gen_datapath_fault(fmd_hdl_t *hdl, cmd_dimm_t *d1, cmd_dimm_t *d2,
+    uint16_t upos, nvlist_t *det)
+{
+	size_t unumlen;
+	uint_t prefixlen;
+	char *cmp, *prefix;
+	nvlist_t *fru, *flt;
+	fmd_case_t *cp;
+
+	if ((cmp = strstr(d1->dimm_unum, "CMP")) == NULL)
+		return;
+
+	unumlen = strlen(d1->dimm_unum);
+	prefixlen = unumlen - strlen(cmp);
+
+	/* the zero-filled buffer keeps the copied prefix NUL-terminated */
+	prefix = fmd_hdl_zalloc(hdl, unumlen, FMD_SLEEP);
+	(void) strncpy(prefix, d1->dimm_unum, prefixlen);
+	fru = cmd_mkboard_fru(hdl, prefix, NULL, NULL);
+	fmd_hdl_free(hdl, prefix, unumlen);
+
+	if (fru == NULL)
+		return;
+
+	(void) nvlist_add_nvlist(fru, FM_FMRI_AUTHORITY, cmd.cmd_auth);
+
+	cp = fmd_case_open(hdl, NULL);
+	flt = fmd_nvl_create_fault(hdl, "fault.memory.datapath", 100,
+	    fru, NULL, fru);
+	fmd_case_add_suspect(hdl, cp, flt);
+	fmd_case_solve(hdl, cp);
+
+	nvlist_free(fru);
+}
diff --git a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem.h b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem.h
index 2d193666c9..f00a53e07e 100644
--- a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem.h
+++ b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _GMEM_H
@@ -49,6 +48,7 @@ typedef struct gmem_stat {
fmd_stat_t ce_interm; /* # of intermittent CEs seen */
fmd_stat_t ce_clearable_persis; /* # of clearable persistent CEs seen */
fmd_stat_t ce_sticky; /* # of sticky CEs seen */
+ fmd_stat_t dimm_migrat; /* # of DIMMs migrated to new version */
} gmem_stat_t;
typedef struct gmem_serd {
@@ -67,6 +67,9 @@ typedef struct gmem {
uint32_t gm_ce_n; /* serd n */
uint64_t gm_ce_t; /* serd t */
uint32_t gm_filter_ratio; /* serd filter ratio */
+ uint32_t gm_low_ce_thresh; /* low threshold retired pages */
+ uint32_t gm_nupos; /* same number of upos */
+ uint32_t gm_dupce; /* number of dup CEs */
} gmem_t;
extern gmem_t gmem;
diff --git a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_dimm.c b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_dimm.c
index 11d071939a..b6cf321e87 100644
--- a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_dimm.c
+++ b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_dimm.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
@@ -32,6 +31,7 @@
#include <gmem_dimm.h>
#include <gmem.h>
#include <errno.h>
+#include <limits.h>
#include <string.h>
#include <strings.h>
#include <fcntl.h>
@@ -61,6 +61,7 @@ gmem_dimm_free(fmd_hdl_t *hdl, gmem_dimm_t *dimm, int destroy)
gmem_case_t *cc = &dimm->dimm_case;
int i;
gmem_mq_t *q;
+ tstamp_t *tsp, *next;
if (cc->cc_cp != NULL) {
gmem_case_fini(hdl, cc->cc_cp, destroy);
@@ -82,6 +83,15 @@ gmem_dimm_free(fmd_hdl_t *hdl, gmem_dimm_t *dimm, int destroy)
fmd_hdl_strfree(hdl, q->mq_serdnm);
q->mq_serdnm = NULL;
}
+
+ for (tsp = gmem_list_next(&q->mq_dupce_tstamp);
+ tsp != NULL; tsp = next) {
+ next = gmem_list_next(tsp);
+ gmem_list_delete(&q->mq_dupce_tstamp,
+ &tsp->ts_l);
+ fmd_hdl_free(hdl, tsp, sizeof (tstamp_t));
+ }
+
gmem_list_delete(&dimm->mq_root[i], q);
fmd_hdl_free(hdl, q, sizeof (gmem_mq_t));
}
@@ -139,6 +149,9 @@ gmem_dimm_create(fmd_hdl_t *hdl, nvlist_t *asru)
dimm = fmd_hdl_zalloc(hdl, sizeof (gmem_dimm_t), FMD_SLEEP);
dimm->dimm_nodetype = GMEM_NT_DIMM;
dimm->dimm_version = GMEM_DIMM_VERSION;
+ dimm->dimm_phys_addr_low = ULLONG_MAX;
+ dimm->dimm_phys_addr_hi = 0;
+ dimm->dimm_syl_error = USHRT_MAX;
gmem_bufname(dimm->dimm_bufname, sizeof (dimm->dimm_bufname), "dimm_%s",
serial);
@@ -177,8 +190,30 @@ gmem_dimm_lookup(fmd_hdl_t *hdl, nvlist_t *asru)
return (dimm);
}
+
static gmem_dimm_t *
-gmem_dimm_wrapv0(fmd_hdl_t *hdl, gmem_dimm_pers_t *pers, size_t psz)
+gmem_dimm_v0tov1(fmd_hdl_t *hdl, gmem_dimm_0_t *old, size_t oldsz)
+{
+ gmem_dimm_t *new;
+ if (oldsz != sizeof (gmem_dimm_0_t)) {
+ fmd_hdl_abort(hdl, "size of state doesn't match size of "
+ "version 0 state (%u bytes).\n", sizeof (gmem_dimm_0_t));
+ }
+
+ new = fmd_hdl_zalloc(hdl, sizeof (gmem_dimm_t), FMD_SLEEP);
+ new->dimm_header = old->dimm0_header;
+ new->dimm_version = GMEM_DIMM_VERSION;
+ new->dimm_asru = old->dimm0_asru;
+ new->dimm_nretired = old->dimm0_nretired;
+ new->dimm_phys_addr_hi = 0;
+ new->dimm_phys_addr_low = ULLONG_MAX;
+
+ fmd_hdl_free(hdl, old, oldsz);
+ return (new);
+}
+
+static gmem_dimm_t *
+gmem_dimm_wrapv1(fmd_hdl_t *hdl, gmem_dimm_pers_t *pers, size_t psz)
{
gmem_dimm_t *dimm;
@@ -205,6 +240,7 @@ gmem_dimm_restore(fmd_hdl_t *hdl, fmd_case_t *cp, gmem_case_ptr_t *ptr)
}
if (dimm == NULL) {
+ int migrated = 0;
size_t dimmsz;
fmd_hdl_debug(hdl, "restoring dimm from %s\n", ptr->ptr_name);
@@ -230,16 +266,28 @@ gmem_dimm_restore(fmd_hdl_t *hdl, fmd_case_t *cp, gmem_case_ptr_t *ptr)
fmd_hdl_debug(hdl, "found %d in version field\n",
dimm->dimm_version);
- switch (dimm->dimm_version) {
- case GMEM_DIMM_VERSION_0:
- dimm = gmem_dimm_wrapv0(hdl, (gmem_dimm_pers_t *)dimm,
+ if (GMEM_DIMM_VERSIONED(dimm)) {
+
+ switch (dimm->dimm_version) {
+ case GMEM_DIMM_VERSION_1:
+ dimm = gmem_dimm_wrapv1(hdl,
+ (gmem_dimm_pers_t *)dimm, dimmsz);
+ break;
+ default:
+ fmd_hdl_abort(hdl, "unknown version (found %d) "
+ "for dimm state referenced by case %s.\n",
+ dimm->dimm_version, fmd_case_uuid(hdl, cp));
+ break;
+ }
+ } else {
+ dimm = gmem_dimm_v0tov1(hdl, (gmem_dimm_0_t *)dimm,
dimmsz);
- break;
- default:
- fmd_hdl_abort(hdl, "unknown version (found %d) "
- "for dimm state referenced by case %s.\n",
- dimm->dimm_version, fmd_case_uuid(hdl, cp));
- break;
+ migrated = 1;
+ }
+
+ if (migrated) {
+ GMEM_STAT_BUMP(dimm_migrat);
+ gmem_dimm_dirty(hdl, dimm);
}
gmem_fmri_restore(hdl, &dimm->dimm_asru);
@@ -432,3 +480,82 @@ gmem_dimm_present(fmd_hdl_t *hdl, nvlist_t *asru)
nvlist_free(dimm);
return (1);
}
+
+/*
+ * Find the "chip" component in the hc-list of the FMRI nvl and store its
+ * instance number in *chip.
+ *
+ * Returns 1 when a chip component was found, 0 otherwise (no hc-list, or
+ * no entry named "chip" that carries both a name and an id).
+ */
+static int
+gmem_find_dimm_chip(nvlist_t *nvl, uint32_t *chip)
+{
+	char *name, *id;
+	nvlist_t **hcl;
+	uint_t i, n;
+
+	/*
+	 * nvlist lookups return 0 or a positive errno, so failure must be
+	 * tested with != 0; hcl and n are not set when the lookup fails.
+	 */
+	if (nvlist_lookup_nvlist_array(nvl, FM_FMRI_HC_LIST, &hcl, &n) != 0)
+		return (0);
+
+	for (i = 0; i < n; i++) {
+		/* skip malformed entries instead of comparing stale strings */
+		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
+		    &name) != 0 ||
+		    nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &id) != 0)
+			continue;
+
+		if (strcmp(name, "chip") == 0) {
+			*chip = (uint32_t)strtoul(id, NULL, 10);
+			return (1);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Return 1 when the resource FMRIs of both DIMMs name the same chip, 0
+ * otherwise (including when either resource or its chip cannot be found).
+ */
+int
+gmem_same_datapath_dimms(fmd_hdl_t *hdl, gmem_dimm_t *d1, gmem_dimm_t *d2)
+{
+	nvlist_t *rsrc1, *rsrc2;
+	uint32_t chip1, chip2;
+	int same = 0;
+
+	rsrc1 = gmem_find_dimm_rsc(hdl, d1->dimm_serial);
+	rsrc2 = gmem_find_dimm_rsc(hdl, d2->dimm_serial);
+
+	if (rsrc1 != NULL && rsrc2 != NULL &&
+	    gmem_find_dimm_chip(rsrc1, &chip1) &&
+	    gmem_find_dimm_chip(rsrc2, &chip2))
+		same = (chip1 == chip2);
+
+	/* release whichever lookups succeeded, even if only one of them did */
+	if (rsrc1 != NULL)
+		nvlist_free(rsrc1);
+	if (rsrc2 != NULL)
+		nvlist_free(rsrc2);
+
+	return (same);
+}
+
+/*
+ * Return 1 when some DIMM on the same datapath as d has upos recorded as
+ * its bad reader/writer symbol-in-error, 0 otherwise.
+ */
+int
+gmem_check_symbol_error(fmd_hdl_t *hdl, gmem_dimm_t *d, uint16_t upos)
+{
+	gmem_dimm_t *cur;
+
+	for (cur = gmem_list_next(&gmem.gm_dimms); cur != NULL;
+	    cur = gmem_list_next(cur)) {
+		if (!gmem_same_datapath_dimms(hdl, cur, d))
+			continue;
+		if (cur->dimm_syl_error == upos)
+			return (1);
+	}
+	return (0);
+}
+
+/*
+ * Record upos as the bad reader/writer symbol-in-error on every DIMM that
+ * shares a datapath with d.
+ */
+void
+gmem_save_symbol_error(fmd_hdl_t *hdl, gmem_dimm_t *d, uint16_t upos)
+{
+	gmem_dimm_t *cur = gmem_list_next(&gmem.gm_dimms);
+
+	while (cur != NULL) {
+		if (gmem_same_datapath_dimms(hdl, cur, d))
+			cur->dimm_syl_error = upos;
+		cur = gmem_list_next(cur);
+	}
+}
diff --git a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_dimm.h b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_dimm.h
index 1f24f8686b..c548bce639 100644
--- a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_dimm.h
+++ b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_dimm.h
@@ -59,18 +59,29 @@ extern "C" {
#define DIMM_MKVERSION(version) (version)
#define GMEM_DIMM_VERSION_0 DIMM_MKVERSION(0)
-#define GMEM_DIMM_VERSION GMEM_DIMM_VERSION_0
+#define GMEM_DIMM_VERSION_1 DIMM_MKVERSION(1)
+#define GMEM_DIMM_VERSION GMEM_DIMM_VERSION_1
#define GMEM_DIMM_VERSIONED(dimm) ((dimm)->dimm_version)
#define GMEM_DIMM_STAT_PREFIX "DIMM_"
+typedef struct gmem_dimm_0 {
+ gmem_header_t dimm0_header; /* Nodetype must be GMEM_NT_DIMM */
+ uint_t dimm0_version; /* DIMM version */
+ gmem_fmri_t dimm0_asru; /* ASRU for this DIMM */
+ uint_t dimm0_flags; /* GMEM_MEM_F_* */
+ uint_t dimm0_nretired; /* # ret'd pages for CEs in DIMM */
+} gmem_dimm_0_t;
+
typedef struct gmem_dimm_pers {
gmem_header_t dimmp_header; /* Nodetype must be GMEM_NT_DIMM */
uint_t dimmp_version;
gmem_fmri_t dimmp_asru; /* ASRU for this DIMM */
uint_t dimmp_flags; /* GMEM_MEM_F_* */
uint_t dimmp_nretired; /* # ret'd pages for CEs in DIMM */
+ uint64_t dimmp_phys_addr_hi; /* retired page addr high */
+ uint64_t dimmp_phys_addr_low; /* retired page addr low */
} gmem_dimm_pers_t;
/*
@@ -87,13 +98,21 @@ typedef struct gmem_mq {
uint16_t mq_unit_position; /* bit for sun4u, nibble for sun4v */
fmd_event_t *mq_ep; /* ereport - for potential fault */
char *mq_serdnm; /* serd eng to retain CE events */
+ uint16_t mq_dupce_count; /* count dup CEs */
+ gmem_list_t mq_dupce_tstamp; /* list dup CE time stamp */
} gmem_mq_t;
+typedef struct tstamp {
+ gmem_list_t ts_l;
+ uint64_t tstamp;
+} tstamp_t;
+
struct gmem_dimm {
gmem_dimm_pers_t dimm_pers;
char *dimm_serial; /* Dimm serial number */
gmem_case_t dimm_case; /* Open CE case against this DIMM */
fmd_stat_t dimm_retstat; /* retirement statistics, this DIMM */
+ uint16_t dimm_syl_error; /* bad rw symbol-in-error */
gmem_list_t
mq_root[GMEM_MAX_CKWDS]; /* per-checkword CEs to correlate */
};
@@ -101,9 +120,12 @@ struct gmem_dimm {
#define GMEM_MQ_TIMELIM (72*60*60) /* 72 hours */
#define GMEM_MQ_SERDT MAXINT /* Never expected to fire */
#define GMEM_MQ_SERDN 2 /* Dup CEs not allowed */
+#define GMEM_MQ_512KB 0x80000 /* space between hi-low addr */
+#define GMEM_MQ_RATIO 0.0625 /* bad r/w page ratio (1/16) */
+
-#define GMEM_DIMM_MAXSIZE sizeof (gmem_dimm_pers_t)
-#define GMEM_DIMM_MINSIZE sizeof (gmem_dimm_pers_t)
+#define GMEM_DIMM_MAXSIZE MAX(sizeof (gmem_dimm_0_t), sizeof (gmem_dimm_pers_t))
+#define GMEM_DIMM_MINSIZE MIN(sizeof (gmem_dimm_0_t), sizeof (gmem_dimm_pers_t))
#define dimm_header dimm_pers.dimmp_header
#define dimm_nodetype dimm_pers.dimmp_header.hdr_nodetype
@@ -113,6 +135,8 @@ struct gmem_dimm {
#define dimm_asru_nvl dimm_pers.dimmp_asru.fmri_nvl
#define dimm_flags dimm_pers.dimmp_flags
#define dimm_nretired dimm_pers.dimmp_nretired
+#define dimm_phys_addr_hi dimm_pers.dimmp_phys_addr_hi
+#define dimm_phys_addr_low dimm_pers.dimmp_phys_addr_low
extern gmem_dimm_t *gmem_dimm_lookup(fmd_hdl_t *, nvlist_t *);
extern gmem_dimm_t *gmem_dimm_create(fmd_hdl_t *, nvlist_t *);
@@ -129,6 +153,9 @@ extern void gmem_dimm_destroy(fmd_hdl_t *, gmem_dimm_t *);
extern void gmem_dimm_validate(fmd_hdl_t *);
extern void gmem_dimm_gc(fmd_hdl_t *);
extern void gmem_dimm_fini(fmd_hdl_t *);
+extern int gmem_same_datapath_dimms(fmd_hdl_t *, gmem_dimm_t *, gmem_dimm_t *);
+extern int gmem_check_symbol_error(fmd_hdl_t *, gmem_dimm_t *, uint16_t);
+extern void gmem_save_symbol_error(fmd_hdl_t *, gmem_dimm_t *, uint16_t);
#ifdef __cplusplus
}
diff --git a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_main.c b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_main.c
index 3fe8c29be6..8886dde21e 100644
--- a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_main.c
+++ b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_main.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <gmem_state.h>
@@ -113,6 +112,7 @@ static gmem_stat_t gm_stats = {
{ "ce_interm", FMD_TYPE_UINT64, "intermittent CEs" },
{ "ce_clearable_persis", FMD_TYPE_UINT64, "clearable persistent CEs" },
{ "ce_sticky", FMD_TYPE_UINT64, "sticky CEs" },
+ { "dimm_migrat", FMD_TYPE_UINT64, "DIMMs migrated to new version" }
};
static const fmd_prop_t fmd_props[] = {
@@ -120,6 +120,9 @@ static const fmd_prop_t fmd_props[] = {
{ "ce_t", FMD_TYPE_TIME, "72h" },
{ "filter_ratio", FMD_TYPE_UINT32, "0" },
{ "max_retired_pages", FMD_TYPE_UINT32, "512" },
+ { "low_ce_thresh", FMD_TYPE_UINT32, "128"},
+ { "nupos", FMD_TYPE_UINT32, "4"},
+ { "dupce", FMD_TYPE_UINT32, "120"},
{ NULL, 0, NULL }
};
@@ -199,6 +202,10 @@ _fmd_init(fmd_hdl_t *hdl)
gmem.gm_ce_n = fmd_prop_get_int32(hdl, "ce_n");
gmem.gm_ce_t = fmd_prop_get_int64(hdl, "ce_t");
gmem.gm_filter_ratio = fmd_prop_get_int32(hdl, "filter_ratio");
+ gmem.gm_low_ce_thresh = fmd_prop_get_int32(hdl, "low_ce_thresh");
+ gmem.gm_nupos = fmd_prop_get_int32(hdl, "nupos");
+ gmem.gm_dupce = fmd_prop_get_int32(hdl, "dupce");
+
if (gmem_state_restore(hdl) < 0) {
_fmd_fini(hdl);
diff --git a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_mem.h b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_mem.h
index cee4f9611b..3510531d55 100644
--- a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_mem.h
+++ b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_mem.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _GMEM_MEM_H
@@ -43,6 +42,7 @@ extern "C" {
#define GMEM_ERPT_PAYLOAD_DIAGNOSE "diagnose"
#define GMEM_ERPT_PAYLOAD_RESOURCE "resource"
+#define GMEM_ERPT_PAYLOAD_DETECTOR "detector"
#define GMEM_ERPT_PAYLOAD_PHYSADDR "phys-addr"
#define GMEM_ERPT_PAYLOAD_OFFSET "offset"
#define GMEM_ERPT_PAYLOAD_SERDN "serd_n"
diff --git a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_memerr.c b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_memerr.c
index ac6805004d..3f5027e0f6 100644
--- a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_memerr.c
+++ b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_memerr.c
@@ -42,11 +42,19 @@
#include <sys/async.h>
#include <sys/errclassify.h>
+#define OFFBIT 0xFFFFFFFFFFFC07FFULL
+#define BIT28_32 0x00000001F0000000ULL
+#define BIT13_17 0x000000000003E000ULL
+#define BIT18_19 0x00000000000C0000ULL
+#define BIT11_12 0x0000000000001800ULL
+
struct ce_name2type {
const char *name;
ce_dispact_t type;
};
+nvlist_t *fru_nvl;
+
static ce_dispact_t
gmem_mem_name2type(const char *name)
{
@@ -68,36 +76,409 @@ gmem_mem_name2type(const char *name)
return (CE_DISP_UNKNOWN);
}
+/*
+ * Topology walker callback used by gmem_find_fault_fru().
+ *
+ * arg is an hc FMRI.  A node matches when its resource hc-list ends in a
+ * "chip" component and every name/id pair equals the corresponding pair in
+ * arg's hc-list.  On a match the node's FRU (if any) is duplicated into the
+ * file-scope fru_nvl and the walk is terminated; otherwise the walk
+ * continues with the next node.
+ */
+/*ARGSUSED*/
+static int
+find_fault_fru(topo_hdl_t *thp, tnode_t *node, void *arg)
+{
+	nvlist_t *nvl = (nvlist_t *)arg;
+	nvlist_t *rsc = NULL, *fru = NULL;
+	nvlist_t **hcl, **topo_hcl;
+	uint_t n1, n2, i;
+	char *name1, *name2;
+	char *id1, *id2;
+	int err;
+	int rv = TOPO_WALK_NEXT;
+
+	if (topo_node_resource(node, &rsc, &err) < 0)
+		return (TOPO_WALK_NEXT);
+
+	/* an empty list has no last component to inspect */
+	if (nvlist_lookup_nvlist_array(rsc, FM_FMRI_HC_LIST,
+	    &topo_hcl, &n1) != 0 || n1 == 0)
+		goto out;
+
+	/* only nodes whose last component is a chip are of interest */
+	if (nvlist_lookup_string(topo_hcl[n1 - 1], FM_FMRI_HC_NAME,
+	    &name1) != 0 || strcmp(name1, "chip") != 0)
+		goto out;
+
+	/* hcl and n2 are only valid when the lookup succeeds */
+	if (nvlist_lookup_nvlist_array(nvl, FM_FMRI_HC_LIST,
+	    &hcl, &n2) != 0 || n1 != n2)
+		goto out;
+
+	for (i = 0; i < n1; i++) {
+		if (nvlist_lookup_string(topo_hcl[i], FM_FMRI_HC_NAME,
+		    &name1) != 0 ||
+		    nvlist_lookup_string(topo_hcl[i], FM_FMRI_HC_ID,
+		    &id1) != 0 ||
+		    nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
+		    &name2) != 0 ||
+		    nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &id2) != 0)
+			goto out;
+
+		if (strcmp(name1, name2) != 0 || strcmp(id1, id2) != 0)
+			goto out;
+	}
+
+	(void) topo_node_fru(node, &fru, NULL, &err);
+	if (fru != NULL) {
+		(void) nvlist_dup(fru, &fru_nvl, NV_UNIQUE_NAME);
+		nvlist_free(fru);
+	}
+	rv = TOPO_WALK_TERMINATE;
+
+out:
+	nvlist_free(rsc);
+	return (rv);
+}
+
+/*
+ * Walk the hc topology looking for the node described by nvl and return
+ * the FRU FMRI that find_fault_fru() left in fru_nvl, or NULL when the
+ * topology cannot be held, the walk cannot be started, or no FRU was stored.
+ */
+nvlist_t *
+gmem_find_fault_fru(fmd_hdl_t *hdl, nvlist_t *nvl)
+{
+	topo_hdl_t *thp;
+	topo_walk_t *twp;
+	int err;
+
+	/* clear the walker's result slot before starting */
+	fru_nvl = NULL;
+
+	thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
+	if (thp == NULL)
+		return (NULL);
+
+	twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, find_fault_fru,
+	    nvl, &err);
+	if (twp != NULL) {
+		(void) topo_walk_step(twp, TOPO_WALK_CHILD);
+		topo_walk_fini(twp);
+	}
+
+	fmd_hdl_topo_rele(hdl, thp);
+	return (fru_nvl);
+}
+
+/*
+ * fault the FRU of the common detector between two DIMMs
+ *
+ * An hc FMRI is built from the leading components of the detector's
+ * hc-list, up to and including its "chip" component.  The FRU of the
+ * matching topology node is then looked up and a new case is solved with a
+ * single fault.memory.datapath suspect against it.  Nothing is done when
+ * the detector has no hc-list or no "chip" component, when the FMRI cannot
+ * be built, or when no FRU is found.
+ */
+void
+gmem_gen_datapath_fault(fmd_hdl_t *hdl, nvlist_t *det)
+{
+	char *name, *id;
+	nvlist_t **det_hcl, **hcl;
+	uint_t det_n, n, i;
+	fmd_case_t *cp;
+	nvlist_t *fltlist;
+	nvlist_t *rsrc = NULL;
+	nvlist_t *fru = NULL;
+
+	/* nvlist lookups return 0 or a positive errno, never a negative */
+	if (nvlist_lookup_nvlist_array(det, FM_FMRI_HC_LIST,
+	    &det_hcl, &det_n) != 0)
+		return;
+
+	/* locate the chip; without one there is no prefix to copy */
+	for (i = 0; i < det_n; i++) {
+		if (nvlist_lookup_string(det_hcl[i], FM_FMRI_HC_NAME,
+		    &name) == 0 && strcmp(name, "chip") == 0)
+			break;
+	}
+	if (i == det_n)
+		return;
+
+	n = i + 1;
+	hcl = fmd_hdl_zalloc(hdl, sizeof (nvlist_t *) * n, FMD_SLEEP);
+	if (hcl == NULL)
+		return;
+
+	/* copy the name/id pairs of components 0..chip into a fresh list */
+	for (i = 0; i < n; i++) {
+		if (nvlist_alloc(&hcl[i],
+		    NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE, 0) != 0 ||
+		    nvlist_lookup_string(det_hcl[i], FM_FMRI_HC_NAME,
+		    &name) != 0 ||
+		    nvlist_lookup_string(det_hcl[i], FM_FMRI_HC_ID,
+		    &id) != 0 ||
+		    nvlist_add_string(hcl[i], FM_FMRI_HC_NAME, name) != 0 ||
+		    nvlist_add_string(hcl[i], FM_FMRI_HC_ID, id) != 0)
+			goto out;
+	}
+
+	if (nvlist_alloc(&rsrc, NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE, 0) != 0)
+		goto out;
+
+	if (nvlist_add_uint8(rsrc, FM_VERSION, FM_HC_SCHEME_VERSION) != 0 ||
+	    nvlist_add_string(rsrc, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0 ||
+	    nvlist_add_string(rsrc, FM_FMRI_HC_ROOT, "") != 0 ||
+	    nvlist_add_uint32(rsrc, FM_FMRI_HC_LIST_SZ, n) != 0 ||
+	    nvlist_add_nvlist_array(rsrc, FM_FMRI_HC_LIST, hcl, n) != 0)
+		goto out;
+
+	fru = gmem_find_fault_fru(hdl, rsrc);
+	if (fru != NULL) {
+		cp = fmd_case_open(hdl, NULL);
+		fltlist = fmd_nvl_create_fault(hdl, "fault.memory.datapath",
+		    100, fru, fru, fru);
+		fmd_case_add_suspect(hdl, cp, fltlist);
+		fmd_case_solve(hdl, cp);
+		nvlist_free(fru);
+	}
+
+out:
+	/* single cleanup path so nothing is freed twice or used after free */
+	for (i = 0; i < n; i++) {
+		if (hcl[i] != NULL)
+			nvlist_free(hcl[i]);
+	}
+	fmd_hdl_free(hdl, hcl, sizeof (nvlist_t *) * n);
+	if (rsrc != NULL)
+		nvlist_free(rsrc);
+}
+
+/*
+ * Convert an unhashed physical address into its hashed form:
+ *	PA[17:11] = (PA[32:28] xor PA[17:13]) :: (PA[19:18] xor PA[12:11])
+ */
+static void
+gmem_to_hashed_addr(uint64_t *addr, uint64_t afar)
+{
+	/* PA[32:28] moved down onto PA[17:13] and xor-ed with it */
+	uint64_t upper = ((afar & BIT28_32) >> 15) ^ (afar & BIT13_17);
+	/* PA[19:18] moved down onto PA[12:11] and xor-ed with it */
+	uint64_t lower = ((afar & BIT18_19) >> 7) ^ (afar & BIT11_12);
+
+	/* OFFBIT clears PA[17:11] so the hashed bits can be merged in */
+	*addr = (afar & OFFBIT) | upper | lower;
+}
+
+/*
+ * Return 1 once threshold entries with unit position upos have been seen
+ * across all of the dimm's per-checkword CE lists, 0 if the lists run out
+ * first.
+ */
+int
+upos_thresh_check(gmem_dimm_t *dimm, uint16_t upos, uint32_t threshold)
+{
+	gmem_mq_t *mq;
+	int cw;
+	int nmatch = 0;
+
+	for (cw = 0; cw < GMEM_MAX_CKWDS; cw++) {
+		for (mq = gmem_list_next(&dimm->mq_root[cw]); mq != NULL;
+		    mq = gmem_list_next(mq)) {
+			if (mq->mq_unit_position != upos)
+				continue;
+			nmatch++;
+			if (nmatch >= threshold)
+				return (1);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Compare the retired-page counts of two DIMMs.  Returns 1 when the counts
+ * differ and the smaller one exceeds GMEM_MQ_RATIO (1/16) of the larger
+ * one; returns 0 otherwise, including when the counts are equal.
+ */
+int
+check_bad_rw_retired_pages(fmd_hdl_t *hdl, gmem_dimm_t *d1, gmem_dimm_t *d2)
+{
+	uint_t n1 = d1->dimm_nretired;
+	uint_t n2 = d2->dimm_nretired;
+	uint_t sret, lret;
+	double ratio;
+
+	if (n1 == n2)
+		return (0);
+
+	sret = (n1 < n2) ? n1 : n2;
+	lret = (n1 < n2) ? n2 : n1;
+	ratio = lret * GMEM_MQ_RATIO;
+
+	if (!(sret > ratio))
+		return (0);
+
+	fmd_hdl_debug(hdl, "sret=%d lret=%d ratio=%.3f",
+	    sret, lret, ratio);
+	return (1);
+}
+
+/*
+ * Bad reader/writer check between two DIMMs.  For each symbol position
+ * queued against d1, the check succeeds when
+ *  - d1 and d2 each hold at least gmem.gm_nupos CEs with that symbol, and
+ *  - check_bad_rw_retired_pages() accepts the pair's retired-page counts.
+ * On success the symbol is stored in *rupos and 1 is returned; otherwise 0
+ * is returned and *rupos is left untouched.
+ */
+static int
+check_bad_rw_between_dimms(fmd_hdl_t *hdl, gmem_dimm_t *d1, gmem_dimm_t *d2,
+    uint16_t *rupos)
+{
+	gmem_mq_t *mq;
+	uint16_t sym;
+	int cw;
+
+	for (cw = 0; cw < GMEM_MAX_CKWDS; cw++) {
+		for (mq = gmem_list_next(&d1->mq_root[cw]); mq != NULL;
+		    mq = gmem_list_next(mq)) {
+			sym = mq->mq_unit_position;
+
+			if (upos_thresh_check(d1, sym, gmem.gm_nupos) &&
+			    upos_thresh_check(d2, sym, gmem.gm_nupos) &&
+			    check_bad_rw_retired_pages(hdl, d1, d2)) {
+				*rupos = sym;
+				return (1);
+			}
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Compare ce_dimm against every other DIMM on the same datapath.  On the
+ * first pair that passes check_bad_rw_between_dimms(), a datapath fault is
+ * generated for the detector, the offending symbol is saved on the
+ * datapath's DIMMs, and the search stops.
+ */
+static void
+bad_reader_writer_check(fmd_hdl_t *hdl, nvlist_t *det, gmem_dimm_t *ce_dimm)
+{
+	gmem_dimm_t *peer;
+	uint16_t upos;
+
+	for (peer = gmem_list_next(&gmem.gm_dimms); peer != NULL;
+	    peer = gmem_list_next(peer)) {
+		if (peer == ce_dimm ||
+		    !gmem_same_datapath_dimms(hdl, ce_dimm, peer) ||
+		    !check_bad_rw_between_dimms(hdl, ce_dimm, peer, &upos))
+			continue;
+
+		gmem_gen_datapath_fault(hdl, det);
+		gmem_save_symbol_error(hdl, ce_dimm, upos);
+		fmd_hdl_debug(hdl,
+		    "check_bad_rw_dimms succeeded: %s %s\n",
+		    ce_dimm->dimm_serial, peer->dimm_serial);
+		return;
+	}
+}
+
+/*
+ * rule 5a checking. The check succeeds if
+ * - nretired >= 512
+ * - nretired >= 128 and (addr_hi - addr_low) / (nretired -1 ) > 512KB
+ */
static void
ce_thresh_check(fmd_hdl_t *hdl, gmem_dimm_t *dimm)
{
+ nvlist_t *flt, *rsrc;
fmd_case_t *cp;
- nvlist_t *dflt, *rsc;
uint_t nret;
+ uint64_t delta_addr = 0;
- if (dimm->dimm_flags & GMEM_F_FAULTING) {
- /* We've already complained about this DIMM */
+ if (dimm->dimm_flags & GMEM_F_FAULTING)
return;
- }
nret = dimm->dimm_nretired;
- /*
- * fault the dimm if number retired page >= max_retired_pages
- */
- if (nret < gmem.gm_max_retired_pages)
+
+ if (nret < gmem.gm_low_ce_thresh)
return;
- dimm->dimm_flags |= GMEM_F_FAULTING;
- gmem_dimm_dirty(hdl, dimm);
+ if (dimm->dimm_phys_addr_hi >= dimm->dimm_phys_addr_low)
+ delta_addr =
+ (dimm->dimm_phys_addr_hi - dimm->dimm_phys_addr_low) /
+ (nret - 1);
- cp = fmd_case_open(hdl, NULL);
- rsc = gmem_find_dimm_rsc(hdl, dimm->dimm_serial);
- dflt = fmd_nvl_create_fault(hdl, GMEM_FAULT_DIMM_PAGES, GMEM_FLTMAXCONF,
- NULL, gmem_dimm_fru(dimm), rsc);
- fmd_case_add_suspect(hdl, cp, dflt);
- fmd_case_solve(hdl, cp);
- if (rsc != NULL)
- nvlist_free(rsc);
+ if (nret >= gmem.gm_max_retired_pages || delta_addr > GMEM_MQ_512KB) {
+
+ fmd_hdl_debug(hdl, "ce_thresh_check succeeded nret=%d", nret);
+ dimm->dimm_flags |= GMEM_F_FAULTING;
+ gmem_dimm_dirty(hdl, dimm);
+
+ cp = fmd_case_open(hdl, NULL);
+ rsrc = gmem_find_dimm_rsc(hdl, dimm->dimm_serial);
+ flt = fmd_nvl_create_fault(hdl, GMEM_FAULT_DIMM_PAGES,
+ GMEM_FLTMAXCONF, NULL, gmem_dimm_fru(dimm), rsrc);
+ fmd_case_add_suspect(hdl, cp, flt);
+ fmd_case_solve(hdl, cp);
+ if (rsrc != NULL)
+ nvlist_free(rsrc);
+ }
+}
+
+/*
+ * Rule 5b check.  The DIMM is faulted as soon as one queue entry (one
+ * symbol position of one afar) has a duplicate-CE count of at least
+ * gmem.gm_dupce.  Only the first qualifying entry produces a fault.
+ */
+static void
+mq_5b_check(fmd_hdl_t *hdl, gmem_dimm_t *dimm)
+{
+	gmem_mq_t *mq;
+	nvlist_t *rsrc, *flt;
+	fmd_case_t *cp;
+	int cw;
+
+	for (cw = 0; cw < GMEM_MAX_CKWDS; cw++) {
+		for (mq = gmem_list_next(&dimm->mq_root[cw]); mq != NULL;
+		    mq = gmem_list_next(mq)) {
+			if (mq->mq_dupce_count < gmem.gm_dupce)
+				continue;
+
+			fmd_hdl_debug(hdl,
+			    "mq_5b_check succeeded: duplicate CE=%d",
+			    mq->mq_dupce_count);
+
+			cp = fmd_case_open(hdl, NULL);
+			rsrc = gmem_find_dimm_rsc(hdl, dimm->dimm_serial);
+			flt = fmd_nvl_create_fault(hdl, GMEM_FAULT_DIMM_PAGES,
+			    GMEM_FLTMAXCONF, NULL, gmem_dimm_fru(dimm), rsrc);
+
+			/* mark the DIMM as faulting and persist the flag */
+			dimm->dimm_flags |= GMEM_F_FAULTING;
+			gmem_dimm_dirty(hdl, dimm);
+
+			fmd_case_add_suspect(hdl, cp, flt);
+			fmd_case_solve(hdl, cp);
+			if (rsrc != NULL)
+				nvlist_free(rsrc);
+			return;
+		}
+	}
+}
+
+/*
+ * Drop every duplicate-CE time stamp on ip that is older than
+ * GMEM_MQ_TIMELIM relative to now, decrementing the duplicate count for
+ * each one removed.
+ */
+static void
+mq_prune_dup(fmd_hdl_t *hdl, gmem_mq_t *ip, uint64_t now)
+{
+	tstamp_t *cur = gmem_list_next(&ip->mq_dupce_tstamp);
+	tstamp_t *following;
+
+	while (cur != NULL) {
+		/* fetch the successor first; cur may be freed below */
+		following = gmem_list_next(cur);
+
+		if (cur->tstamp < now - GMEM_MQ_TIMELIM) {
+			gmem_list_delete(&ip->mq_dupce_tstamp, &cur->ts_l);
+			fmd_hdl_free(hdl, cur, sizeof (tstamp_t));
+			ip->mq_dupce_count--;
+		}
+		cur = following;
+	}
+}
+
+/*
+ * Refresh an existing queue entry with a newly seen duplicate CE: the
+ * entry takes the new time stamp and event, its SERD engine is recreated
+ * and fed the event, and one more duplicate-CE time stamp is recorded.
+ */
+static void
+mq_update(fmd_hdl_t *hdl, fmd_event_t *ep, gmem_mq_t *ip, uint64_t now)
+{
+	tstamp_t *stamp;
+
+	ip->mq_tstamp = now;
+	ip->mq_ep = ep;
+
+	/* start over with a fresh SERD engine holding only this event */
+	if (fmd_serd_exists(hdl, ip->mq_serdnm))
+		fmd_serd_destroy(hdl, ip->mq_serdnm);
+	fmd_serd_create(hdl, ip->mq_serdnm, GMEM_MQ_SERDN, GMEM_MQ_SERDT);
+	(void) fmd_serd_record(hdl, ip->mq_serdnm, ep);
+
+	stamp = fmd_hdl_zalloc(hdl, sizeof (tstamp_t), FMD_SLEEP);
+	stamp->tstamp = now;
+	gmem_list_append(&ip->mq_dupce_tstamp, stamp);
+	ip->mq_dupce_count++;
+}
/*
@@ -108,6 +489,8 @@ mq_create(fmd_hdl_t *hdl, fmd_event_t *ep,
uint64_t afar, uint16_t upos, uint16_t ckwd, uint64_t now)
{
gmem_mq_t *cp;
+ tstamp_t *tsp;
+
cp = fmd_hdl_zalloc(hdl, sizeof (gmem_mq_t), FMD_SLEEP);
cp->mq_tstamp = now;
cp->mq_ckwd = ckwd;
@@ -117,6 +500,11 @@ mq_create(fmd_hdl_t *hdl, fmd_event_t *ep,
cp->mq_serdnm =
gmem_mq_serdnm_create(hdl, "mq", afar, ckwd, upos);
+ tsp = fmd_hdl_zalloc(hdl, sizeof (tstamp_t), FMD_SLEEP);
+ tsp->tstamp = now;
+ gmem_list_append(&cp->mq_dupce_tstamp, tsp);
+ cp->mq_dupce_count = 1;
+
/*
* Create SERD to keep this event from being removed
* by fmd which may not know there is an event pointer
@@ -135,6 +523,8 @@ gmem_mq_t *
mq_destroy(fmd_hdl_t *hdl, gmem_list_t *lp, gmem_mq_t *ip)
{
gmem_mq_t *jp = gmem_list_next(ip);
+ tstamp_t *tsp, *next;
+
if (ip->mq_serdnm != NULL) {
if (fmd_serd_exists(hdl, ip->mq_serdnm))
@@ -142,6 +532,14 @@ mq_destroy(fmd_hdl_t *hdl, gmem_list_t *lp, gmem_mq_t *ip)
fmd_hdl_strfree(hdl, ip->mq_serdnm);
ip->mq_serdnm = NULL;
}
+
+ for (tsp = gmem_list_next(&ip->mq_dupce_tstamp); tsp != NULL;
+ tsp = next) {
+ next = gmem_list_next(tsp);
+ gmem_list_delete(&ip->mq_dupce_tstamp, &tsp->ts_l);
+ fmd_hdl_free(hdl, tsp, sizeof (tstamp_t));
+ }
+
gmem_list_delete(lp, &ip->mq_l);
fmd_hdl_free(hdl, ip, sizeof (gmem_mq_t));
@@ -172,12 +570,15 @@ mq_add(fmd_hdl_t *hdl, gmem_dimm_t *dimm, fmd_event_t *ep,
* Found a duplicate cw, unit_position, and afar.
* Delete this node, to be superseded by the new
* node added below.
+ * update the mq_t structure
*/
- ip = mq_destroy(hdl, &dimm->mq_root[cw], ip);
+ mq_update(hdl, ep, ip, now);
+ return;
} else {
ip = gmem_list_next(ip);
}
}
+
jp = mq_create(hdl, ep, afar, unit_position, cw, now);
if (ip == NULL)
gmem_list_append(&dimm->mq_root[cw], jp);
@@ -205,6 +606,7 @@ mq_prune(fmd_hdl_t *hdl, gmem_dimm_t *dimm, uint64_t now)
*/
ip = mq_destroy(hdl, &dimm->mq_root[cw], ip);
} else {
+ mq_prune_dup(hdl, ip, now);
/* tstamp < now - ce_t */
ip = gmem_list_next(ip);
}
@@ -317,18 +719,21 @@ gmem_evdisp_t
gmem_ce(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
{
uint16_t symbol_pos, cw;
- uint64_t phyaddr, offset;
+ uint64_t phyaddr, offset, addr;
uint32_t filter_ratio = 0;
gmem_dimm_t *dimm;
gmem_page_t *page;
nvlist_t *fru = NULL;
nvlist_t *topo_rsc = NULL;
- nvlist_t *rsrc;
+ nvlist_t *rsrc, *det;
const char *uuid;
ce_dispact_t type;
boolean_t diagnose;
char *sn;
int err, rc;
+ uint64_t *now;
+ uint_t nelem;
+ int skip_error = 0;
err = nvlist_lookup_boolean_value(nvl, GMEM_ERPT_PAYLOAD_DIAGNOSE,
&diagnose);
@@ -359,6 +764,9 @@ gmem_ce(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
fmd_hdl_debug(hdl, "serial %s", sn);
+ if (nvlist_lookup_nvlist(nvl, GMEM_ERPT_PAYLOAD_DETECTOR, &det) != 0)
+ return (GMEM_EVD_BAD);
+
/*
* Find dimm fru by serial number.
*/
@@ -390,19 +798,27 @@ gmem_ce(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
&symbol_pos);
err |= nvlist_lookup_uint16(nvl, GMEM_ERPT_PAYLOAD_CKW, &cw);
- if (err == 0)
- fmd_hdl_debug(hdl, "symbol_pos=%d cw=%d",
- symbol_pos, cw);
+ if (err == 0) {
+ fmd_hdl_debug(hdl, "symbol_pos=%d cw=%d", symbol_pos, cw);
- if (!(dimm->dimm_flags & GMEM_F_FAULTING) && (err == 0)) {
- uint64_t *now;
- uint_t nelem;
if (nvlist_lookup_uint64_array(nvl,
"__tod", &now, &nelem) == 0) {
- mq_add(hdl, dimm, ep, phyaddr, symbol_pos,
- cw, *now);
+ skip_error = gmem_check_symbol_error(hdl, dimm,
+ symbol_pos);
+
+ if (!skip_error ||
+ !(dimm->dimm_flags & GMEM_F_FAULTING))
+ mq_add(hdl, dimm, ep, phyaddr, symbol_pos,
+ cw, *now);
+
mq_prune(hdl, dimm, *now);
- mq_check(hdl, dimm);
+
+ if (!skip_error)
+ bad_reader_writer_check(hdl, det, dimm);
+ if (!(dimm->dimm_flags & GMEM_F_FAULTING)) {
+ mq_check(hdl, dimm);
+ mq_5b_check(hdl, dimm);
+ }
}
}
@@ -428,6 +844,11 @@ gmem_ce(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
return (GMEM_EVD_BAD);
}
+ if (gmem_check_symbol_error(hdl, dimm, symbol_pos)) {
+ nvlist_free(fru);
+ return (GMEM_EVD_REDUND);
+ }
+
if (page == NULL) {
page = gmem_page_create(hdl, fru, phyaddr, offset);
if (page == NULL) {
@@ -475,6 +896,13 @@ gmem_ce(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
ep, phyaddr, offset);
if (rc) {
+ gmem_to_hashed_addr(&addr, phyaddr);
+
+ if (addr > dimm->dimm_phys_addr_hi)
+ dimm->dimm_phys_addr_hi = addr;
+ if (addr < dimm->dimm_phys_addr_low)
+ dimm->dimm_phys_addr_low = addr;
+
dimm->dimm_nretired++;
dimm->dimm_retstat.fmds_value.ui64++;
gmem_dimm_dirty(hdl, dimm);
diff --git a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_page.c b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_page.c
index 9a2de4bf2f..c453f86311 100644
--- a/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_page.c
+++ b/usr/src/cmd/fm/modules/sun4v/generic-mem/gmem_page.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
@@ -327,6 +326,8 @@ gmem_page_fault(fmd_hdl_t *hdl, nvlist_t *fru, nvlist_t *rsc,
if (page != NULL) {
if (page->page_flags & GMEM_F_FAULTING ||
gmem_page_unusable(hdl, page)) {
+ if (rsc != NULL)
+ nvlist_free(rsc);
page->page_flags |= GMEM_F_FAULTING;
return (0);
}