diff options
Diffstat (limited to 'usr/src/uts/sun4u/cpu')
-rw-r--r-- | usr/src/uts/sun4u/cpu/spitfire.c | 77 | ||||
-rw-r--r-- | usr/src/uts/sun4u/cpu/us3_common.c | 30 | ||||
-rw-r--r-- | usr/src/uts/sun4u/cpu/us3_jalapeno.c | 20 |
3 files changed, 37 insertions, 90 deletions
diff --git a/usr/src/uts/sun4u/cpu/spitfire.c b/usr/src/uts/sun4u/cpu/spitfire.c index 35d97e6b23..75f57ce39c 100644 --- a/usr/src/uts/sun4u/cpu/spitfire.c +++ b/usr/src/uts/sun4u/cpu/spitfire.c @@ -432,6 +432,7 @@ void cpu_setup(void) { extern int page_retire_messages; + extern int page_retire_first_ue; extern int at_flags; #if defined(SF_ERRATA_57) extern caddr_t errata57_limit; @@ -445,9 +446,11 @@ cpu_setup(void) /* * Spitfire isn't currently FMA-aware, so we have to enable the - * page retirement messages. + * page retirement messages. We also change the default policy + * for UE retirement to allow clearing of transient errors. */ page_retire_messages = 1; + page_retire_first_ue = 0; /* * save the cache bootup state. @@ -895,10 +898,7 @@ cpu_ce_error(struct regs *rp, ulong_t p_afar, ulong_t p_afsr, curthread->t_ontrap != NULL) { if (curthread->t_ontrap->ot_prot & OT_DATA_EC) { - page_t *pp = page_numtopp_nolock((pfn_t) - (ecc->flt_addr >> MMU_PAGESHIFT)); - - if (pp != NULL && page_isretired(pp)) { + if (page_retire_check(ecc->flt_addr, NULL) == 0) { queue = 0; } } @@ -1093,6 +1093,7 @@ cpu_ce_log_err(struct async_flt *ecc, errorq_elem_t *eqep) char unum[UNUM_NAMLEN]; int len = 0; int ce_verbose = 0; + int err; ASSERT(ecc->flt_func != NULL); @@ -1107,15 +1108,9 @@ cpu_ce_log_err(struct async_flt *ecc, errorq_elem_t *eqep) * Count errors per unum. * Non-memory errors are all counted via a special unum string. */ - if (ce_count_unum(ecc->flt_status, len, unum) == PAGE_IS_FAILING && + if ((err = ce_count_unum(ecc->flt_status, len, unum)) != PR_OK && automatic_page_removal) { - page_t *pp = page_numtopp_nolock((pfn_t) - (ecc->flt_addr >> MMU_PAGESHIFT)); - - if (pp) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_FAILING); - } + (void) page_retire(ecc->flt_addr, err); } if (ecc->flt_panic) { @@ -2092,11 +2087,7 @@ cpu_async_log_err(void *flt) if (!panicstr && (aflt->flt_stat & S_AFSR_ALL_ERRS) == P_AFSR_UE && aflt->flt_prot == AFLT_PROT_EC) { - page_t *pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - - if (pp != NULL && page_isretired(pp)) { - + if (page_retire_check(aflt->flt_addr, NULL) == 0) { /* Zero the address to clear the error */ softcall(ecc_page_zero, (void *)aflt->flt_addr); return; @@ -2305,25 +2296,7 @@ cpu_async_log_err(void *flt) if (aflt->flt_addr != AFLT_INV_ADDR && aflt->flt_in_memory) { if (!panicstr) { - /* - * Retire the bad page that caused the error - */ - page_t *pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - - if (pp != NULL) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_TOXIC); - } else { - uint64_t pa = - P2ALIGN(aflt->flt_addr, MMU_PAGESIZE); - - cpu_aflt_log(CE_CONT, 3, spf_flt, - CPU_ERRID_FIRST, NULL, - ": cannot schedule clearing of error on " - "page 0x%08x.%08x; page not in VM system", - (uint32_t)(pa >> 32), (uint32_t)pa); - } + (void) page_retire(aflt->flt_addr, PR_UE); } else { /* * Clear UEs on panic so that we don't @@ -4089,12 +4062,7 @@ static void ecache_page_retire(void *arg) { uint64_t paddr = (uint64_t)arg; - page_t *pp = page_numtopp_nolock((pfn_t)(paddr >> MMU_PAGESHIFT)); - - if (pp) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_TOXIC); - } + (void) page_retire(paddr, PR_UE); } void @@ -4331,15 +4299,14 @@ add_leaky_bucket_timeout(void) * false intermittents, so these intermittents can be safely ignored. * * If the error count is excessive for a DIMM, this function will return - * PAGE_IS_FAILING, and the CPU module may then decide to remove that page - * from use. + * PR_MCE, and the CPU module may then decide to remove that page from use. */ static int ce_count_unum(int status, int len, char *unum) { int i; struct ce_info *psimm = mem_ce_simm; - int page_status = PAGE_IS_OK; + int page_status = PR_OK; ASSERT(psimm != NULL); @@ -4375,7 +4342,7 @@ ce_count_unum(int status, int len, char *unum) cmn_err(CE_WARN, "[AFT0] Sticky Softerror encountered " "on Memory Module %s\n", unum); - page_status = PAGE_IS_FAILING; + page_status = PR_MCE; } else if (status & ECC_PERSISTENT) { psimm[i].leaky_bucket_cnt = 1; psimm[i].intermittent_total = 0; @@ -4404,7 +4371,7 @@ ce_count_unum(int status, int len, char *unum) cmn_err(CE_WARN, "[AFT0] Sticky Softerror encountered " "on Memory Module %s\n", unum); - page_status = PAGE_IS_FAILING; + page_status = PR_MCE; } else if (status & ECC_PERSISTENT) { int new_value; @@ -4422,7 +4389,7 @@ ce_count_unum(int status, int len, char *unum) ecc_softerr_interval % 60); atomic_add_16( &psimm[i].leaky_bucket_cnt, -1); - page_status = PAGE_IS_FAILING; + page_status = PR_MCE; } } else { /* Intermittent */ psimm[i].intermittent_total++; @@ -4444,15 +4411,11 @@ ce_count_unum(int status, int len, char *unum) void cpu_ce_count_unum(struct async_flt *ecc, int len, char *unum) { - if (ce_count_unum(ecc->flt_status, len, unum) == PAGE_IS_FAILING && - automatic_page_removal) { - page_t *pp = page_numtopp_nolock((pfn_t) - (ecc->flt_addr >> MMU_PAGESHIFT)); + int err; - if (pp) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_FAILING); - } + err = ce_count_unum(ecc->flt_status, len, unum); + if (err != PR_OK && automatic_page_removal) { + (void) page_retire(ecc->flt_addr, err); } } diff --git a/usr/src/uts/sun4u/cpu/us3_common.c b/usr/src/uts/sun4u/cpu/us3_common.c index f8d8b2bb77..f7cc35c664 100644 --- a/usr/src/uts/sun4u/cpu/us3_common.c +++ b/usr/src/uts/sun4u/cpu/us3_common.c @@ -2205,7 +2205,7 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) { ch_async_flt_t *ch_flt = (ch_async_flt_t *)flt; struct async_flt *aflt = (struct async_flt *)flt; - page_t *pp; + uint64_t errors; switch (ch_flt->flt_type) { case CPU_INV_AFSR: @@ -2236,9 +2236,6 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) */ case CPU_CE: case CPU_EMC: - pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - /* * We want to skip logging and further classification * only if ALL the following conditions are true: @@ -2258,7 +2255,7 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_CE && aflt->flt_prot == AFLT_PROT_EC) { - if (pp != NULL && page_isretired(pp)) { + if (page_retire_check(aflt->flt_addr, NULL) == 0) { if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) { /* @@ -2289,17 +2286,17 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) * * Note: Check cpu_impl_async_log_err if changing this */ - if (pp) { - if (page_isretired(pp) || page_deteriorating(pp)) { + if (page_retire_check(aflt->flt_addr, &errors) == EINVAL) { + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_NOPP); + } else { + if (errors != PR_OK) { CE_XDIAG_SETSKIPCODE(aflt->flt_disp, CE_XDIAG_SKIP_PAGEDET); } else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep, offsetof(ch_async_flt_t, cmn_asyncflt))) { return (0); } - } else { - CE_XDIAG_SETSKIPCODE(aflt->flt_disp, - CE_XDIAG_SKIP_NOPP); } /*FALLTHRU*/ @@ -2325,11 +2322,7 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) if (!panicstr && (ch_flt->afsr_errs & (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_UE && aflt->flt_prot == AFLT_PROT_EC) { - page_t *pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - - if (pp != NULL && page_isretired(pp)) { - + if (page_retire_check(aflt->flt_addr, NULL) == 0) { /* Zero the address to clear the error */ softcall(ecc_page_zero, (void *)aflt->flt_addr); return (0); @@ -2387,12 +2380,7 @@ void cpu_page_retire(ch_async_flt_t *ch_flt) { struct async_flt *aflt = (struct async_flt *)ch_flt; - page_t *pp = page_numtopp_nolock(aflt->flt_addr >> MMU_PAGESHIFT); - - if (pp != NULL) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_TOXIC); - } + (void) page_retire(aflt->flt_addr, PR_UE); } /* diff --git a/usr/src/uts/sun4u/cpu/us3_jalapeno.c b/usr/src/uts/sun4u/cpu/us3_jalapeno.c index cd71848200..0b7936d426 100644 --- a/usr/src/uts/sun4u/cpu/us3_jalapeno.c +++ b/usr/src/uts/sun4u/cpu/us3_jalapeno.c @@ -316,7 +316,7 @@ cpu_impl_async_log_err(void *flt, errorq_elem_t *eqep) { ch_async_flt_t *ch_flt = (ch_async_flt_t *)flt; struct async_flt *aflt = (struct async_flt *)flt; - page_t *pp; + uint64_t errors; switch (ch_flt->flt_type) { @@ -329,19 +329,15 @@ cpu_impl_async_log_err(void *flt, errorq_elem_t *eqep) return (CH_ASYNC_LOG_DONE); case CPU_RCE: - pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - if (pp) { - if (page_isretired(pp) || page_deteriorating(pp)) { - CE_XDIAG_SETSKIPCODE(aflt->flt_disp, - CE_XDIAG_SKIP_PAGEDET); - } else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep, - offsetof(ch_async_flt_t, cmn_asyncflt))) { - return (CH_ASYNC_LOG_RECIRC); - } - } else { + if (page_retire_check(aflt->flt_addr, &errors) == EINVAL) { CE_XDIAG_SETSKIPCODE(aflt->flt_disp, CE_XDIAG_SKIP_NOPP); + } else if (errors != PR_OK) { + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_PAGEDET); + } else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep, + offsetof(ch_async_flt_t, cmn_asyncflt))) { + return (CH_ASYNC_LOG_RECIRC); } /*FALLTHRU*/ /* |