-rw-r--r-- | usr/src/uts/common/Makefile.files | 1
-rw-r--r-- | usr/src/uts/common/io/mem.c | 63
-rw-r--r-- | usr/src/uts/common/os/mem_config.c | 67
-rw-r--r-- | usr/src/uts/common/sys/mem.h | 20
-rw-r--r-- | usr/src/uts/common/vm/page.h | 110
-rw-r--r-- | usr/src/uts/common/vm/page_lock.c | 198
-rw-r--r-- | usr/src/uts/common/vm/page_retire.c | 1473
-rw-r--r-- | usr/src/uts/common/vm/vm_page.c | 850
-rw-r--r-- | usr/src/uts/common/vm/vm_pagelist.c | 20
-rw-r--r-- | usr/src/uts/i86pc/os/machdep.c | 2
-rw-r--r-- | usr/src/uts/sun4u/cpu/spitfire.c | 77
-rw-r--r-- | usr/src/uts/sun4u/cpu/us3_common.c | 30
-rw-r--r-- | usr/src/uts/sun4u/cpu/us3_jalapeno.c | 20
-rw-r--r-- | usr/src/uts/sun4u/io/pci/pci_ecc.c | 24
-rw-r--r-- | usr/src/uts/sun4u/ngdr/io/dr_mem.c | 68
-rw-r--r-- | usr/src/uts/sun4u/os/ecc.c | 10
-rw-r--r-- | usr/src/uts/sun4u/os/mach_cpu_states.c | 26
-rw-r--r-- | usr/src/uts/sun4v/os/mach_cpu_states.c | 26 |
19 files changed, 1986 insertions, 1150 deletions
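Before the diff itself, a sketch of the new user-visible surface. The change grows the /dev/mem ioctl set so that a userland consumer such as fmd's diagnosis engine can retire, unretire, and query pages by physical address. The code below is illustrative only and is not part of the change: the command values are copied from the sys/mem.h hunk further down, the helper name is made up, and a nonzero return from the kernel ioctl handler surfaces in userland as -1 with errno set (for example EAGAIN when the page could not be retired immediately).

/*
 * Hypothetical userland sketch: drive the new page retire ioctls.
 * Command values copied from the sys/mem.h hunk in this diff.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdint.h>

#define	MEM_PAGE_RETIRE		(('M' << 8) | 0x02)
#define	MEM_PAGE_ISRETIRED	(('M' << 8) | 0x03)

static int
retire_page(uint64_t pa)
{
	uint64_t arg = pa;
	int fd, rc;

	if ((fd = open("/dev/mem", O_RDONLY)) == -1)
		return (-1);

	/* Request an FMA-directed retire of the page containing 'pa'. */
	rc = ioctl(fd, MEM_PAGE_RETIRE, &arg);
	if (rc == -1 && errno == EAGAIN) {
		/* Retire is pending; ask whether it has completed yet. */
		arg = pa;
		rc = ioctl(fd, MEM_PAGE_ISRETIRED, &arg);
	}
	(void) close(fd);
	return (rc);
}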
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 57b3f1968f..32c38cdac3 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -51,6 +51,7 @@ COMMON_CORE_OBJS += \ lgrp_topo.o \ mutex.o \ page_lock.o \ + page_retire.o \ panic.o \ param.o \ putnext.o \ diff --git a/usr/src/uts/common/io/mem.c b/usr/src/uts/common/io/mem.c index 3aa34f9427..1e42907a5d 100644 --- a/usr/src/uts/common/io/mem.c +++ b/usr/src/uts/common/io/mem.c @@ -460,47 +460,51 @@ mmioctl_vtop(intptr_t data) } /* - * Given a PA, retire that page or check whether it has already been retired. + * Given a PA, execute the given page retire command on it. */ static int mmioctl_page_retire(int cmd, intptr_t data) { + extern int page_retire_test(void); uint64_t pa; - pfn_t pfn; - page_t *pp; - if (copyin((void *)data, &pa, sizeof (uint64_t))) + if (copyin((void *)data, &pa, sizeof (uint64_t))) { return (EFAULT); + } - pfn = pa >> MMU_PAGESHIFT; + switch (cmd) { + case MEM_PAGE_ISRETIRED: + return (page_retire_check(pa, NULL)); - if (!pf_is_memory(pfn) || (pp = page_numtopp_nolock(pfn)) == NULL) - return (EINVAL); + case MEM_PAGE_UNRETIRE: + return (page_unretire(pa)); - /* - * If we're checking, see if the page is retired; if not, confirm that - * its status is at least set to be failing. If neither, return EIO. - */ - if (cmd == MEM_PAGE_ISRETIRED) { - if (page_isretired(pp)) - return (0); + case MEM_PAGE_RETIRE: + return (page_retire(pa, PR_FMA)); - if (!page_isfailing(pp)) - return (EIO); + case MEM_PAGE_RETIRE_MCE: + return (page_retire(pa, PR_MCE)); - return (EAGAIN); - } + case MEM_PAGE_RETIRE_UE: + return (page_retire(pa, PR_UE)); - /* - * Try to retire the page. If the retire fails, it will be scheduled to - * occur when the page is freed. If this page is out of circulation - * already, or is in the process of being retired, we fail. - */ - if (page_isretired(pp) || page_isfailing(pp)) - return (EIO); + case MEM_PAGE_GETERRORS: + { + uint64_t page_errors; + int rc = page_retire_check(pa, &page_errors); + if (copyout(&page_errors, (void *)data, + sizeof (uint64_t))) { + return (EFAULT); + } + return (rc); + } + + case MEM_PAGE_RETIRE_TEST: + return (page_retire_test()); + + } - page_settoxic(pp, PAGE_IS_FAULTY); - return (page_retire(pp, PAGE_IS_FAILING) ? EAGAIN : 0); + return (EINVAL); } #ifdef __sparc @@ -606,6 +610,11 @@ mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp) case MEM_PAGE_RETIRE: case MEM_PAGE_ISRETIRED: + case MEM_PAGE_UNRETIRE: + case MEM_PAGE_RETIRE_MCE: + case MEM_PAGE_RETIRE_UE: + case MEM_PAGE_GETERRORS: + case MEM_PAGE_RETIRE_TEST: if (getminor(dev) != M_MEM) return (ENXIO); return (mmioctl_page_retire(cmd, data)); diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c index 0d29cc59d6..8f398ac602 100644 --- a/usr/src/uts/common/os/mem_config.c +++ b/usr/src/uts/common/os/mem_config.c @@ -1770,31 +1770,13 @@ delete_memory_thread(caddr_t amhp) } if (!page_try_reclaim_lock(pp, SE_EXCL, - SE_EXCL_WANTED)) { - if (page_isretired(pp)) { - /* - * Page has been retired. - * - * Its shared lock can and - * must be upgraded to an - * exclusive lock in order - * to hashout the page when - * the delete completes. - */ - page_lock_clr_exclwanted(pp); - if (!page_tryupgrade(pp)) { - mutex_enter( - &mhp->mh_mutex); - continue; - } - } else { - /* - * Page in use elsewhere. 
- */ - MDSTAT_INCR(mhp, lockfail); - mutex_enter(&mhp->mh_mutex); - continue; - } + SE_EXCL_WANTED | SE_RETIRED)) { + /* + * Page in use elsewhere. Skip it. + */ + MDSTAT_INCR(mhp, lockfail); + mutex_enter(&mhp->mh_mutex); + continue; } /* * See if the cage expanded into the delete. @@ -1802,15 +1784,12 @@ delete_memory_thread(caddr_t amhp) * cage to expand. */ if (PP_ISNORELOC(pp)) { - if (page_isretired(pp)) - page_downgrade(pp); - else - page_unlock(pp); + page_unlock(pp); mutex_enter(&mhp->mh_mutex); mhp->mh_cancel = KPHYSM_ENONRELOC; break; } - if (page_isretired(pp)) { + if (PP_RETIRED(pp)) { /* * Page has been retired and is * not part of the cage so we @@ -1861,11 +1840,11 @@ delete_memory_thread(caddr_t amhp) } /* * Keep stats on pages encountered that - * are toxic or failing but not retired. + * are marked for retirement. */ - if (page_istoxic(pp)) { + if (PP_TOXIC(pp)) { MDSTAT_INCR(mhp, toxic); - } else if (page_isfailing(pp)) { + } else if (PP_PR_REQ(pp)) { MDSTAT_INCR(mhp, failing); } /* @@ -1876,7 +1855,7 @@ delete_memory_thread(caddr_t amhp) * previously associated with the page. */ if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { - if (!page_istoxic(pp)) { + if (!PP_TOXIC(pp)) { /* * Must relocate locked in * memory pages. @@ -1949,7 +1928,7 @@ delete_memory_thread(caddr_t amhp) #ifdef MEM_DEL_STATS start_pgrp = ddi_get_lbolt(); #endif /* MEM_DEL_STATS */ - if (mod && !page_istoxic(pp)) { + if (mod && !PP_TOXIC(pp)) { /* * Lock all constituent pages * of a large page to ensure @@ -2020,7 +1999,7 @@ delete_memory_thread(caddr_t amhp) * set, we cannot do anything here to deal * with it. */ - if (page_istoxic(pp)) { + if (PP_TOXIC(pp)) { page_unlock(pp); #ifdef MEM_DEL_STATS ntick_pgrp = (uint64_t)ddi_get_lbolt() - @@ -2067,7 +2046,7 @@ delete_memory_thread(caddr_t amhp) continue; } if (page_try_reclaim_lock(pp, SE_EXCL, - SE_EXCL_WANTED)) { + SE_EXCL_WANTED | SE_RETIRED)) { if (PP_ISFREE(pp)) { goto free_page_collect; } @@ -2229,12 +2208,8 @@ delete_memory_thread(caddr_t amhp) /* * If the memory delete was cancelled, exclusive-wanted bits must - * be cleared, and also any retired pages that - * were accounted for above must have their exclusive lock - * downgraded to a shared lock to return them to their previous - * state. - * Otherwise, if the memory delete has completed, retired pages - * must be hashed out. + * be cleared. If there are retired pages being deleted, they need + * to be unretired. */ for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; mdsp = mdsp->mds_next) { @@ -2264,16 +2239,16 @@ delete_memory_thread(caddr_t amhp) pp = page_numtopp_nolock(pfn); } ASSERT(pp != NULL); - ASSERT(page_isretired(pp)); + ASSERT(PP_RETIRED(pp)); if (mhp->mh_cancel != 0) { - page_downgrade(pp); + page_unlock(pp); /* * To satisfy ASSERT below in * cancel code. */ mhp->mh_hold_todo++; } else { - page_hashout(pp, (kmutex_t *)NULL); + (void) page_unretire_pp(pp, 0); } } } diff --git a/usr/src/uts/common/sys/mem.h b/usr/src/uts/common/sys/mem.h index e741d56b9f..f2b23b8029 100644 --- a/usr/src/uts/common/sys/mem.h +++ b/usr/src/uts/common/sys/mem.h @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -60,11 +60,25 @@ typedef struct mem_vtop { * and drivers should not make use of these interfaces: they can change without * notice and programs that consume them will fail to run on future releases. 
*/ -#define MEM_PAGE_RETIRE (('M' << 8) | 0x02) -#define MEM_PAGE_ISRETIRED (('M' << 8) | 0x03) #define MEM_NAME (('M' << 8) | 0x04) #define MEM_INFO (('M' << 8) | 0x05) +#define MEM_PAGE_RETIRE (('M' << 8) | 0x02) +#define MEM_PAGE_ISRETIRED (('M' << 8) | 0x03) +#define MEM_PAGE_UNRETIRE (('M' << 8) | 0x06) +#define MEM_PAGE_GETERRORS (('M' << 8) | 0x07) +#define MEM_PAGE_RETIRE_MCE (('M' << 8) | 0x08) +#define MEM_PAGE_RETIRE_UE (('M' << 8) | 0x09) +#define MEM_PAGE_RETIRE_TEST (('M' << 8) | 0x0A) + +/* + * Bits returned from MEM_PAGE_GETERRORS ioctl for use by fmd(1M). + */ +#define MEM_PAGE_ERR_NONE 0x0 +#define MEM_PAGE_ERR_MULTI_CE 0x1 +#define MEM_PAGE_ERR_UE 0x2 +#define MEM_PAGE_ERR_FMA_REQ 0x8 + typedef struct mem_name { uint64_t m_addr; /* memory address */ uint64_t m_synd; /* architecture-specific syndrome */ diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h index 2e4183bdc0..c1db6f1391 100644 --- a/usr/src/uts/common/vm/page.h +++ b/usr/src/uts/common/vm/page.h @@ -76,6 +76,12 @@ typedef enum { */ #define SE_EXCL_WANTED 0x02 +/* + * All page_*lock() requests will be denied unless this flag is set in + * the 'es' parameter. + */ +#define SE_RETIRED 0x04 + #endif /* _KERNEL | _KMEMUSER */ typedef int selock_t; @@ -630,37 +636,6 @@ struct lgrp; #define PG_LIST_ISCAGE 0x2000 /* - * Flags for setting the p_toxic flag when a page has errors - * These flags may be OR'ed into the p_toxic page flag to - * indicate that error(s) have occurred on a page, - * (see page_settoxic()). If both PAGE_IS_TOXIC and - * PAGE_IS_FAILING are set, PAGE_IS_FAILING takes precedence. - * - * When an error happens on a page, the trap handler sets - * PAGE_IS_FAULTY on the page to indicate that an error has been - * seen on the page. The error could be really a memory error or - * something else (like a datapath error). When it is determined - * that it is a memory error, the page is marked as PAGE_IS_TOXIC - * or PAGE_IS_FAILING depending on the type of error and then - * retired. - * - * We use the page's 'toxic' flag to determine whether the page - * has just got a single error - PAGE_IS_TOXIC - or is being - * retired due to multiple soft errors - PAGE_IS_FAILING. In - * page_free(), a page that has been marked PAGE_IS_FAILING will - * not be cleaned, it will always be retired. A page marked - * PAGE_IS_TOXIC is cleaned and is retired only if this attempt at - * cleaning fails. - * - * When a page has been successfully retired, we set PAGE_IS_RETIRED. - */ -#define PAGE_IS_OK 0x0 -#define PAGE_IS_TOXIC 0x1 -#define PAGE_IS_FAILING 0x2 -#define PAGE_IS_RETIRED 0x4 -#define PAGE_IS_FAULTY 0x8 - -/* * Page frame operations. 
 */
page_t *page_lookup(struct vnode *, u_offset_t, se_t);
@@ -707,6 +682,7 @@ void page_boot_demote(page_t *);
 void page_promote_size(page_t *, uint_t);
 void page_list_add_pages(page_t *, int);
 void page_list_sub(page_t *, int);
+void page_list_sub_pages(page_t *, uint_t);
 void page_list_xfer(page_t *, int, int);
 void page_list_break(page_t **, page_t **, size_t);
 void page_list_concat(page_t **, page_t **);
@@ -720,6 +696,7 @@ int page_try_reclaim_lock(page_t *, se_t, int);
 int page_tryupgrade(page_t *);
 void page_downgrade(page_t *);
 void page_unlock(page_t *);
+void page_unlock_noretire(page_t *);
 void page_lock_delete(page_t *);
 int page_pp_lock(page_t *, int, int);
 void page_pp_unlock(page_t *, int, int);
@@ -759,19 +736,22 @@ int page_isfree(page_t *);
 int page_isref(page_t *);
 int page_ismod(page_t *);
 int page_release(page_t *, int);
-int page_retire(page_t *, uchar_t);
-int page_istoxic(page_t *);
-int page_isfailing(page_t *);
-int page_isretired(page_t *);
-int page_deteriorating(page_t *);
+void page_retire_init(void);
+int page_retire(uint64_t, uchar_t);
+int page_retire_check(uint64_t, uint64_t *);
+int page_unretire(uint64_t);
+int page_unretire_pp(page_t *, int);
+void page_tryretire(page_t *);
+void page_retire_hunt(void (*)(page_t *));
+void page_retire_mdboot_cb(page_t *);
+void page_clrtoxic(page_t *, uchar_t);
 void page_settoxic(page_t *, uchar_t);
-void page_clrtoxic(page_t *);
-void page_clrtoxic_flag(page_t *, uchar_t);
-int page_isfaulty(page_t *);
+
 int page_mem_avail(pgcnt_t);
 void page_set_props(page_t *, uint_t);
 void page_clr_all_props(page_t *);
+int page_clear_lck_cow(page_t *, int);
 
 kmutex_t *page_vnode_mutex(struct vnode *);
 kmutex_t *page_se_mutex(struct page *);
@@ -792,6 +772,7 @@ void page_free_replacement_page(page_t *);
 int page_relocate_cage(page_t **, page_t **);
 
 int page_try_demote_pages(page_t *);
+int page_try_demote_free_pages(page_t *);
 void page_demote_free_pages(page_t *);
 
 struct anon_map;
@@ -879,7 +860,56 @@ int page_szc_user_filtered(size_t);
 #define	PP_CLRMIGRATE(pp)	((pp)->p_state &= ~P_MIGRATE)
 #define	PP_CLRSWAP(pp)		((pp)->p_state &= ~P_SWAP)
-
+/*
+ * Flags for page_t p_toxic, for tracking memory hardware errors.
+ *
+ * These flags are OR'ed into p_toxic with page_settoxic() to track which
+ * error(s) have occurred on a given page. The flags are cleared with
+ * page_clrtoxic(). Both page_settoxic() and page_clrtoxic() use atomic
+ * primitives to manipulate the p_toxic field so no other locking is needed.
+ *
+ * When an error occurs on a page, p_toxic is set to record the error. The
+ * error could be a memory error or something else (e.g. a datapath). The Page
+ * Retire mechanism does not try to determine the exact cause of the error;
+ * Page Retire rightly leaves that sort of determination to FMA's Diagnostic
+ * Engine (DE).
+ *
+ * Note that, while p_toxic bits can be set without holding any locks, they
+ * should only be cleared while holding the page exclusively locked.
+ *
+ * Pages with PR_UE or PR_FMA flags are retired unconditionally, while pages
+ * with PR_MCE are retired if the system has not retired too many of them.
+ *
+ * A page must be exclusively locked to be retired. Pages can be retired if
+ * they are mapped, modified, or both, as long as they are not marked PR_UE,
+ * since pages with uncorrectable errors cannot be relocated in memory.
+ * Once a page has been successfully retired it is zeroed, attached to the
+ * retired_pages vnode and, finally, PR_RETIRED is set in p_toxic. 
The other + * p_toxic bits are NOT cleared. Pages are not left locked after retiring them + * to avoid special case code throughout the kernel; rather, page_*lock() will + * fail to lock the page, unless SE_RETIRED is passed as an argument. + * + * While we have your attention, go take a look at the comments at the + * beginning of page_retire.c too. + */ +#define PR_OK 0x00 /* no problem */ +#define PR_MCE 0x01 /* page has seen two or more CEs */ +#define PR_UE 0x02 /* page has an unhandled UE */ +#define PR_UE_SCRUBBED 0x04 /* page has seen a UE but was cleaned */ +#define PR_FMA 0x08 /* A DE wants this page retired */ +#define PR_RESV 0x10 /* Reserved for future use */ +#define PR_BUSY 0x20 /* Page retire is in progress */ +#define PR_MSG 0x40 /* message(s) already printed for this page */ +#define PR_RETIRED 0x80 /* This page has been retired */ + +#define PR_REASONS (PR_UE | PR_MCE | PR_FMA) +#define PR_TOXIC (PR_UE) +#define PR_ERRMASK (PR_UE | PR_UE_SCRUBBED | PR_MCE | PR_FMA) +#define PR_ALLFLAGS (0xFF) + +#define PP_RETIRED(pp) ((pp)->p_toxic & PR_RETIRED) +#define PP_TOXIC(pp) ((pp)->p_toxic & PR_TOXIC) +#define PP_PR_REQ(pp) (((pp)->p_toxic & PR_REASONS) && !PP_RETIRED(pp)) /* * kpm large page description. diff --git a/usr/src/uts/common/vm/page_lock.c b/usr/src/uts/common/vm/page_lock.c index 9a2d12dd8e..d34f7b2737 100644 --- a/usr/src/uts/common/vm/page_lock.c +++ b/usr/src/uts/common/vm/page_lock.c @@ -189,16 +189,17 @@ uint_t page_lock_reclaim; uint_t page_lock_bad_reclaim; uint_t page_lock_same_page; uint_t page_lock_upgrade; +uint_t page_lock_retired; uint_t page_lock_upgrade_failed; uint_t page_lock_deleted; uint_t page_trylock_locked; +uint_t page_trylock_failed; uint_t page_trylock_missed; uint_t page_try_reclaim_upgrade; #endif /* VM_STATS */ - /* * Acquire the "shared/exclusive" lock on a page. * @@ -222,27 +223,47 @@ page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim) * callers wanting an exclusive (writer) lock may prevent shared-lock * (reader) starvation by setting the es parameter to SE_EXCL_WANTED. * In this case, when an exclusive lock cannot be acquired, p_selock's - * SE_EWANTED bit is set. - * This bit, along with the se and es parameters, are used to decide - * if the requested lock should be granted: + * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied + * if the page is slated for retirement. + * + * The se and es parameters determine if the lock should be granted + * based on the following decision table: + * + * Lock wanted es flags p_selock/SE_EWANTED Action + * ----------- -------------- ------------------- --------- + * SE_EXCL any [1][2] unlocked/any grant lock, clear SE_EWANTED + * SE_EXCL SE_EWANTED any lock/any deny, set SE_EWANTED + * SE_EXCL none any lock/any deny + * SE_SHARED n/a [2][3] shared/0 grant + * SE_SHARED n/a [2][3] unlocked/0 grant + * SE_SHARED n/a shared/1 deny + * SE_SHARED n/a unlocked/1 deny + * SE_SHARED n/a excl/any deny + * + * Notes: + * [1] The code grants an exclusive lock to the caller and clears the bit + * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED + * bit's value. This was deemed acceptable as we are not concerned about + * exclusive-lock starvation. If this ever becomes an issue, a priority or + * fifo mechanism should also be implemented. 
Meantime, the thread that
+ * set SE_EWANTED should be prepared to catch this condition and reset it.
+ *
+ * [2] Retired pages may not be locked at any time, regardless of the
+ * disposition of se, unless the es parameter has SE_RETIRED flag set.
 *
- *	Lock wanted	SE_EXCL_WANTED	p_selock/SE_EWANTED	Action
- * ----------	--------------	-------------------	---------
- *	SE_EXCL		no	dont-care/1	deny lock
- *	SE_EXCL	any(see note)	unlocked/any	grant lock, clear SE_EWANTED
- *	SE_EXCL		yes	any lock/any	deny, set SE_EWANTED
- *	SE_EXCL		no	any lock/any	deny
- *	SE_SHARED	not applicable	shared/0	grant
- *	SE_SHARED	not applicable	unlocked/0	grant
- *	SE_SHARED	not applicable	shared/1	deny
- *	SE_SHARED	not applicable	unlocked/1	deny
- *	SE_SHARED	not applicable	excl/any	deny
+ * [3] If the page is slated for retirement, the lock is denied.
 *
- * Note: the code grants an exclusive lock to the caller and clears
- * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
- * bit's value. This was deemed acceptable as we are not concerned about
- * exclusive-lock starvation. If this ever becomes an issue, a priority or
- * fifo mechanism should also be implemented.
+ * Notes on values of "es":
+ *
+ * es & 1: page_lookup_create will attempt page relocation
+ * es & SE_EXCL_WANTED: caller wants SE_EWANTED set (e.g. delete
+ * memory thread); this prevents reader-starvation of waiting
+ * writer thread(s) by giving priority to writers over readers.
+ * es & SE_RETIRED: caller wants to lock pages even if they are
+ * retired. Default is to deny the lock if the page is retired.
+ *
+ * And yes, we know, the semantics of this function are too complicated.
+ * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
@@ -261,17 +282,14 @@ page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
 	mutex_enter(pse);
 
-	/*
-	 * Current uses of 'es':
-	 * es == 1		page_lookup_create will attempt page relocation
-	 * es == SE_EXCL_WANTED	caller wants SE_EWANTED set (eg. delete
-	 *	memory thread); this prevents reader-starvation of waiting
-	 *	writer thread(s).
-	 */
-
 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
-	    ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));
+	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
+
+	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
+		mutex_exit(pse);
+		VM_STAT_ADD(page_lock_retired);
+		return (0);
+	}
 
 	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
 		se = SE_EXCL;
@@ -312,7 +330,7 @@ page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
 	}
 
 	if (se == SE_EXCL) {
-		if ((es != SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
+		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
 			/*
 			 * if the caller wants a writer lock (but did not
 			 * specify exclusive access), and there is a pending
@@ -327,7 +345,7 @@
 			retval = 1;
 		} else {
 			/* page is locked */
-			if (es == SE_EXCL_WANTED) {
+			if (es & SE_EXCL_WANTED) {
 				/* set the SE_EWANTED bit */
 				pp->p_selock |= SE_EWANTED;
 			}
@@ -336,10 +354,17 @@
 	} else {
 		retval = 0;
 		if (pp->p_selock >= 0) {
-			/* readers are not allowed when excl wanted */
-			if (!(pp->p_selock & SE_EWANTED)) {
-				pp->p_selock += SE_READER;
-				retval = 1;
+			/*
+			 * Readers are not allowed when excl wanted or
+			 * a retire is pending. 
Since kvp pages can take + * a long time to be retired, we make an exception + * for them to avoid hanging threads unnecessarily. + */ + if ((pp->p_selock & SE_EWANTED) == 0) { + if (!PP_PR_REQ(pp) || pp->p_vnode == &kvp) { + pp->p_selock += SE_READER; + retval = 1; + } } } } @@ -468,7 +493,13 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es) old = pp->p_selock; ASSERT(((es & SE_EXCL_WANTED) == 0) || - ((es == SE_EXCL_WANTED) && (se == SE_EXCL))); + ((es & SE_EXCL_WANTED) && (se == SE_EXCL))); + + if (PP_RETIRED(pp) && !(es & SE_RETIRED)) { + mutex_exit(pse); + VM_STAT_ADD(page_trylock_failed); + return (0); + } if (se == SE_SHARED && es == 1 && old == 0) { se = SE_EXCL; @@ -477,11 +508,20 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es) if (se == SE_SHARED) { if (!PP_ISFREE(pp)) { if (old >= 0) { - /* readers are not allowed when excl wanted */ - if (!(old & SE_EWANTED)) { - pp->p_selock = old + SE_READER; - mutex_exit(pse); - return (1); + /* + * Readers are not allowed when excl wanted + * or a retire is pending. Since kvp pages can + * take a long time to be retired, we make an + * exception for them to avoid hanging threads + * unnecessarily. + */ + if ((old & SE_EWANTED) == 0) { + if (!PP_PR_REQ(pp) || + pp->p_vnode == &kvp) { + pp->p_selock = old + SE_READER; + mutex_exit(pse); + return (1); + } } } mutex_exit(pse); @@ -498,7 +538,7 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es) * SE_EWANTED is not set, or if the caller specified * SE_EXCL_WANTED. */ - if (!(old & SE_EWANTED) || (es == SE_EXCL_WANTED)) { + if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) { if ((old & ~SE_EWANTED) == 0) { /* no reader/writer lock held */ THREAD_KPRI_REQUEST(); @@ -508,7 +548,7 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es) return (1); } } - if (es == SE_EXCL_WANTED) { + if (es & SE_EXCL_WANTED) { /* page is locked, set the SE_EWANTED bit */ pp->p_selock |= SE_EWANTED; } @@ -526,9 +566,15 @@ page_trylock(page_t *pp, se_t se) kmutex_t *pse = PAGE_SE_MUTEX(pp); mutex_enter(pse); - if (pp->p_selock & SE_EWANTED) { - /* fail if a thread wants exclusive access */ + if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) || + (se == SE_SHARED && PP_PR_REQ(pp) && pp->p_vnode != &kvp)) { + /* + * Fail if a thread wants exclusive access and page is + * retired, if the page is slated for retirement, or a + * share lock is requested. + */ mutex_exit(pse); + VM_STAT_ADD(page_trylock_failed); return (0); } @@ -551,6 +597,41 @@ page_trylock(page_t *pp, se_t se) } /* + * Variant of page_unlock() specifically for the page freelist + * code. The mere existence of this code is a vile hack that + * has resulted due to the backwards locking order of the page + * freelist manager; please don't call it. + */ +void +page_unlock_noretire(page_t *pp) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + selock_t old; + + mutex_enter(pse); + + old = pp->p_selock; + if ((old & ~SE_EWANTED) == SE_READER) { + pp->p_selock = old & ~SE_READER; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + } else if ((old & ~SE_EWANTED) == SE_DELETED) { + panic("page_unlock_noretire: page %p is deleted", pp); + } else if (old < 0) { + THREAD_KPRI_RELEASE(); + pp->p_selock &= SE_EWANTED; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + } else if ((old & ~SE_EWANTED) > SE_READER) { + pp->p_selock = old - SE_READER; + } else { + panic("page_unlock_noretire: page %p is not locked", pp); + } + + mutex_exit(pse); +} + +/* * Release the page's "shared/exclusive" lock and wake up anyone * who might be waiting for it. 
 */
@@ -561,6 +642,7 @@ page_unlock(page_t *pp)
 	selock_t old;
 
 	mutex_enter(pse);
+
 	old = pp->p_selock;
 	if ((old & ~SE_EWANTED) == SE_READER) {
 		pp->p_selock = old & ~SE_READER;
@@ -578,7 +660,29 @@
 	} else {
 		panic("page_unlock: page %p is not locked", pp);
 	}
-	mutex_exit(pse);
+
+	if (pp->p_selock == 0 && PP_PR_REQ(pp)) {
+		/*
+		 * Try to retire the page. If it retires, great.
+		 * If not, oh well, we'll get it in the next unlock
+		 * request, and repeat the cycle. Regardless,
+		 * page_tryretire() will drop the page lock.
+		 */
+		if ((pp->p_toxic & PR_BUSY) == 0) {
+			THREAD_KPRI_REQUEST();
+			pp->p_selock = SE_WRITER;
+			page_settoxic(pp, PR_BUSY);
+			mutex_exit(pse);
+			page_tryretire(pp);
+		} else {
+			pp->p_selock = SE_WRITER;
+			page_clrtoxic(pp, PR_BUSY);
+			pp->p_selock = 0;
+			mutex_exit(pse);
+		}
+	} else {
+		mutex_exit(pse);
+	}
 }
 
 /*
diff --git a/usr/src/uts/common/vm/page_retire.c b/usr/src/uts/common/vm/page_retire.c
new file mode 100644
index 0000000000..30b218c15d
--- /dev/null
+++ b/usr/src/uts/common/vm/page_retire.c
@@ -0,0 +1,1473 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Page Retire - Big Theory Statement.
+ *
+ * This file handles removing sections of faulty memory from use when the
+ * user land FMA Diagnosis Engine requests that a page be removed or when
+ * a CE or UE is detected by the hardware.
+ *
+ * In the bad old days, the kernel side of Page Retire did a lot of the work
+ * on its own. Now, with the DE keeping track of errors, the kernel side is
+ * rather simple-minded on most platforms.
+ *
+ * Errors are all reflected to the DE, and after digesting the error and
+ * looking at all previously reported errors, the DE decides what should
+ * be done about the current error. If the DE wants a particular page to
+ * be retired, then the kernel page retire code is invoked via an ioctl.
+ * On non-FMA platforms, the ue_drain and ce_drain paths end up calling
+ * page retire to handle the error. Since page retire is just a simple
+ * mechanism, it doesn't need to differentiate between the different callers.
+ *
+ * The p_toxic field in the page_t is used to indicate which errors have
+ * occurred and what action has been taken on a given page. Because errors are
+ * reported without regard to the locked state of a page, no locks are used
+ * to SET the error bits in p_toxic. However, in order to clear the error
+ * bits, the page_t must be held exclusively locked.
+ *
+ * When page_retire() is called, it must be able to acquire locks, sleep, etc. 
+ * It must not be called from high-level interrupt context. + * + * Depending on how the requested page is being used at the time of the retire + * request (and on the availability of sufficient system resources), the page + * may be retired immediately, or just marked for retirement later. For + * example, locked pages are marked, while free pages are retired. Multiple + * requests may be made to retire the same page, although there is no need + * to: once the p_toxic flags are set, the page will be retired as soon as it + * can be exclusively locked. + * + * The retire mechanism is driven centrally out of page_unlock(). To expedite + * the retirement of pages, further requests for SE_SHARED locks are denied + * as long as a page retirement is pending. In addition, as long as pages are + * pending retirement a background thread runs periodically trying to retire + * those pages. Pages which could not be retired while the system is running + * are scrubbed prior to rebooting to avoid latent errors on the next boot. + * + * Single CE pages and UE pages without persistent errors are scrubbed and + * returned to service. Recidivist pages, as well as FMA-directed requests + * for retirement, result in the page being taken out of service. Once the + * decision is made to take a page out of service, the page is cleared, hashed + * onto the retired_pages vnode, marked as retired, and it is unlocked. No + * other requesters (except for unretire) are allowed to lock retired pages. + * + * The public routines return (sadly) 0 if they worked and a non-zero error + * value if something went wrong. This is done for the ioctl side of the + * world to allow errors to be reflected all the way out to user land. The + * non-zero values are explained in comments atop each function. + */ + +/* + * Things to fix: + * + * 1. Cleanup SE_EWANTED. Since we're aggressive about trying to retire + * pages, we can use page_retire_pp() to replace SE_EWANTED and all + * the special delete_memory_thread() code just goes away. + * + * 2. Trying to retire non-relocatable kvp pages may result in a + * quagmire. This is because seg_kmem() no longer keeps its pages locked, + * and calls page_lookup() in the free path; since kvp pages are modified + * and don't have a usable backing store, page_retire() can't do anything + * with them, and we'll keep denying the lock to seg_kmem_free() in a + * vicious cycle. To prevent that, we don't deny locks to kvp pages, and + * hence only call page_retire_pp() from page_unlock() in the free path. + * Since most kernel pages are indefinitely held anyway, and don't + * participate in I/O, this is of little consequence. + * + * 3. Low memory situations will be interesting. If we don't have + * enough memory for page_relocate() to succeed, we won't be able to + * retire dirty pages; nobody will be able to push them out to disk + * either, since we aggressively deny the page lock. We could change + * fsflush so it can recognize this situation, grab the lock, and push + * the page out, where we'll catch it in the free path and retire it. + * + * 4. Beware of places that have code like this in them: + * + * if (! page_tryupgrade(pp)) { + * page_unlock(pp); + * while (! page_lock(pp, SE_EXCL, NULL, P_RECLAIM)) { + * / *NOTHING* / + * } + * } + * page_free(pp); + * + * The problem is that pp can change identity right after the + * page_unlock() call. In particular, page_retire() can step in + * there, change pp's identity, and hash pp onto the retired_vnode. 
+ * + * Of course, other functions besides page_retire() can have the + * same effect. A kmem reader can waltz by, set up a mapping to the + * page, and then unlock the page. Page_free() will then go castors + * up. So if anybody is doing this, it's already a bug. + * + * 5. mdboot()'s call into page_retire_hunt() should probably be + * moved lower. Where the call is made now, we can get into trouble + * by scrubbing a kernel page that is then accessed later. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mman.h> +#include <sys/vnode.h> +#include <sys/cmn_err.h> +#include <sys/ksynch.h> +#include <sys/thread.h> +#include <sys/disp.h> +#include <sys/ontrap.h> +#include <sys/vmsystm.h> +#include <sys/mem_config.h> +#include <sys/atomic.h> +#include <sys/callb.h> +#include <vm/page.h> +#include <vm/vm_dep.h> +#include <vm/as.h> +#include <vm/hat.h> + +/* + * vnode for all pages which are retired from the VM system; + */ +vnode_t *retired_pages; + +/* + * Background thread that wakes up periodically to try to retire pending + * pages. This prevents threads from becoming blocked indefinitely in + * page_lookup() or some other routine should the page(s) they are waiting + * on become eligible for social security. + */ +static void page_retire_thread(void); +static kthread_t *pr_thread_id; +static kcondvar_t pr_cv; +static kmutex_t pr_thread_mutex; +static clock_t pr_thread_shortwait; +static clock_t pr_thread_longwait; + +/* + * Make a list of all of the pages that have been marked for retirement + * but are not yet retired. At system shutdown, we will scrub all of the + * pages in the list in case there are outstanding UEs. Then, we + * cross-check this list against the number of pages that are yet to be + * retired, and if we find inconsistencies, we scan every page_t in the + * whole system looking for any pages that need to be scrubbed for UEs. + * The background thread also uses this queue to determine which pages + * it should keep trying to retire. 
+ */ +#ifdef DEBUG +#define PR_PENDING_QMAX 32 +#else /* DEBUG */ +#define PR_PENDING_QMAX 256 +#endif /* DEBUG */ +page_t *pr_pending_q[PR_PENDING_QMAX]; +kmutex_t pr_q_mutex; + +/* + * Page retire global kstats + */ +struct page_retire_kstat { + kstat_named_t pr_retired; + kstat_named_t pr_requested; + kstat_named_t pr_requested_free; + kstat_named_t pr_enqueue_fail; + kstat_named_t pr_dequeue_fail; + kstat_named_t pr_pending; + kstat_named_t pr_failed; + kstat_named_t pr_failed_kernel; + kstat_named_t pr_limit; + kstat_named_t pr_limit_exceeded; + kstat_named_t pr_fma; + kstat_named_t pr_mce; + kstat_named_t pr_ue; + kstat_named_t pr_ue_cleared_retire; + kstat_named_t pr_ue_cleared_free; + kstat_named_t pr_ue_persistent; + kstat_named_t pr_unretired; +}; + +static struct page_retire_kstat page_retire_kstat = { + { "pages_retired", KSTAT_DATA_UINT64}, + { "pages_retire_request", KSTAT_DATA_UINT64}, + { "pages_retire_request_free", KSTAT_DATA_UINT64}, + { "pages_notenqueued", KSTAT_DATA_UINT64}, + { "pages_notdequeued", KSTAT_DATA_UINT64}, + { "pages_pending", KSTAT_DATA_UINT64}, + { "pages_deferred", KSTAT_DATA_UINT64}, + { "pages_deferred_kernel", KSTAT_DATA_UINT64}, + { "pages_limit", KSTAT_DATA_UINT64}, + { "pages_limit_exceeded", KSTAT_DATA_UINT64}, + { "pages_fma", KSTAT_DATA_UINT64}, + { "pages_multiple_ce", KSTAT_DATA_UINT64}, + { "pages_ue", KSTAT_DATA_UINT64}, + { "pages_ue_cleared_retired", KSTAT_DATA_UINT64}, + { "pages_ue_cleared_freed", KSTAT_DATA_UINT64}, + { "pages_ue_persistent", KSTAT_DATA_UINT64}, + { "pages_unretired", KSTAT_DATA_UINT64}, +}; + +static kstat_t *page_retire_ksp = NULL; + +#define PR_INCR_KSTAT(stat) \ + atomic_add_64(&(page_retire_kstat.stat.value.ui64), 1) +#define PR_DECR_KSTAT(stat) \ + atomic_add_64(&(page_retire_kstat.stat.value.ui64), -1) + +#define PR_KSTAT_RETIRED_CE (page_retire_kstat.pr_mce.value.ui64) +#define PR_KSTAT_RETIRED_FMA (page_retire_kstat.pr_fma.value.ui64) +#define PR_KSTAT_RETIRED_NOTUE (PR_KSTAT_RETIRED_CE + PR_KSTAT_RETIRED_FMA) +#define PR_KSTAT_PENDING (page_retire_kstat.pr_pending.value.ui64) +#define PR_KSTAT_EQFAIL (page_retire_kstat.pr_enqueue_fail.value.ui64) +#define PR_KSTAT_DQFAIL (page_retire_kstat.pr_dequeue_fail.value.ui64) + +/* + * Limit the number of multiple CE page retires. + * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in + * basis points, where 100 basis points equals one percent. + */ +#define MCE_BPT 10 +uint64_t max_pages_retired_bps = MCE_BPT; +#define PAGE_RETIRE_LIMIT ((physmem * max_pages_retired_bps) / 10000) + +/* + * Control over the verbosity of page retirement. + * + * When set to zero (the default), no messages will be printed. + * When set to one, summary messages will be printed. + * When set > one, all messages will be printed. + * + * A value of one will trigger detailed messages for retirement operations, + * and is intended as a platform tunable for processors where FMA's DE does + * not run (e.g., spitfire). Values > one are intended for debugging only. + */ +int page_retire_messages = 0; + +/* + * Control whether or not we retire dirty UE pages. By default we do + * since we assume the data is corrupt and the process(es) using it will + * be killed. This is platform tunable only, and should probably not be + * changed, ever. + */ +int page_retire_modified = 1; + +/* + * Control whether or not we return scrubbed UE pages to service. + * By default we do not since FMA wants to run its diagnostics first + * and then ask us to unretire the page if it passes. 
Non-FMA platforms + * may set this to zero so we will only retire recidivist pages. It should + * not be changed by the user. + */ +int page_retire_first_ue = 1; + +/* + * Master enable for page retire. This prevents a CE or UE early in boot + * from trying to retire a page before page_retire_init() has finished + * setting things up. This is internal only and is not a tunable! + */ +static int pr_enable = 0; + +extern struct vnode kvp; + +#ifdef DEBUG +struct page_retire_debug { + int prd_dup; + int prd_noaction; + int prd_queued; + int prd_notqueued; + int prd_dequeue; + int prd_top; + int prd_locked; + int prd_reloc; + int prd_modce; + int prd_modue_fail; + int prd_modue_retire; + int prd_kern; + int prd_free; + int prd_noreclaim; + int prd_hashout; + int prd_fma; + int prd_uescrubbed; + int prd_uenotscrubbed; + int prd_mce; + int prd_prlocked; + int prd_prnotlocked; + int prd_prretired; + int prd_ulocked; + int prd_unotretired; + int prd_udestroy; + int prd_uhashout; + int prd_uunretired; + int prd_unotlocked; + int prd_checkhit; + int prd_checkmiss; + int prd_tctop; + int prd_tclocked; + int prd_hunt; + int prd_dohunt; + int prd_earlyhunt; + int prd_latehunt; + int prd_nofreedemote; + int prd_nodemote; + int prd_demoted; +} pr_debug; + +#define PR_DEBUG(foo) ((pr_debug.foo)++) + +/* + * A type histogram. We record the incidence of the various toxic + * flag combinations along with the interesting page attributes. The + * goal is to get as many combinations as we can while driving all + * pr_debug values nonzero (indicating we've exercised all possible + * code paths across all possible page types). Not all combinations + * will make sense -- e.g. PRT_MOD|PRT_KERNEL. + * + * pr_type offset bit encoding (when examining with a debugger): + * + * PRT_NAMED - 0x4 + * PRT_KERNEL - 0x8 + * PRT_FREE - 0x10 + * PRT_MOD - 0x20 + * PRT_FMA - 0x0 + * PRT_MCE - 0x40 + * PRT_UE - 0x80 + */ + +#define PRT_NAMED 0x01 +#define PRT_KERNEL 0x02 +#define PRT_FREE 0x04 +#define PRT_MOD 0x08 +#define PRT_FMA 0x00 /* yes, this is not a mistake */ +#define PRT_MCE 0x10 +#define PRT_UE 0x20 +#define PRT_ALL 0x3F + +int pr_types[PRT_ALL+1]; + +#define PR_TYPES(pp) { \ + int whichtype = 0; \ + if (pp->p_vnode) \ + whichtype |= PRT_NAMED; \ + if (pp->p_vnode == &kvp) \ + whichtype |= PRT_KERNEL; \ + if (PP_ISFREE(pp)) \ + whichtype |= PRT_FREE; \ + if (hat_ismod(pp)) \ + whichtype |= PRT_MOD; \ + if (pp->p_toxic & PR_UE) \ + whichtype |= PRT_UE; \ + if (pp->p_toxic & PR_MCE) \ + whichtype |= PRT_MCE; \ + pr_types[whichtype]++; \ +} + +int recl_calls; +int recl_mtbf = 3; +int reloc_calls; +int reloc_mtbf = 7; +int pr_calls; +int pr_mtbf = 15; + +#define MTBF(v, f) (((++(v)) & (f)) != (f)) + +#else /* DEBUG */ + +#define PR_DEBUG(foo) /* nothing */ +#define PR_TYPES(foo) /* nothing */ +#define MTBF(v, f) (1) + +#endif /* DEBUG */ + +/* + * page_retire_done() - completion processing + * + * Used by the page_retire code for common completion processing. + * It keeps track of how many times a given result has happened, + * and writes out an occasional message. + * + * May be called with a NULL pp (PRD_INVALID_PA case). 
+ */ +#define PRD_INVALID_KEY -1 +#define PRD_SUCCESS 0 +#define PRD_PENDING 1 +#define PRD_FAILED 2 +#define PRD_DUPLICATE 3 +#define PRD_INVALID_PA 4 +#define PRD_LIMIT 5 +#define PRD_UE_SCRUBBED 6 +#define PRD_UNR_SUCCESS 7 +#define PRD_UNR_CANTLOCK 8 +#define PRD_UNR_NOT 9 + +typedef struct page_retire_op { + int pr_key; /* one of the PRD_* defines from above */ + int pr_count; /* How many times this has happened */ + int pr_retval; /* return value */ + int pr_msglvl; /* message level - when to print */ + char *pr_message; /* Cryptic message for field service */ +} page_retire_op_t; + +static page_retire_op_t page_retire_ops[] = { + /* key count retval msglvl message */ + {PRD_SUCCESS, 0, 0, 1, + "Page 0x%08x.%08x removed from service"}, + {PRD_PENDING, 0, EAGAIN, 2, + "Page 0x%08x.%08x will be retired on free"}, + {PRD_FAILED, 0, EAGAIN, 0, NULL}, + {PRD_DUPLICATE, 0, EBUSY, 2, + "Page 0x%08x.%08x already retired"}, + {PRD_INVALID_PA, 0, EINVAL, 2, + "PA 0x%08x.%08x is not a relocatable page"}, + {PRD_LIMIT, 0, 0, 1, + "Page 0x%08x.%08x not retired due to limit exceeded"}, + {PRD_UE_SCRUBBED, 0, 0, 1, + "Previously reported error on page 0x%08x.%08x cleared"}, + {PRD_UNR_SUCCESS, 0, 0, 1, + "Page 0x%08x.%08x returned to service"}, + {PRD_UNR_CANTLOCK, 0, EAGAIN, 2, + "Page 0x%08x.%08x could not be unretired"}, + {PRD_UNR_NOT, 0, EBADF, 2, + "Page 0x%08x.%08x is not retired"}, + {PRD_INVALID_KEY, 0, 0, 0, NULL} /* MUST BE LAST! */ +}; + +/* + * print a message if page_retire_messages is true. + */ +#define PR_MESSAGE(debuglvl, msglvl, msg, pa) \ +{ \ + uint64_t p = (uint64_t)pa; \ + if (page_retire_messages >= msglvl && msg != NULL) { \ + cmn_err(debuglvl, msg, \ + (uint32_t)(p >> 32), (uint32_t)p); \ + } \ +} + +/* + * Note that multiple bits may be set in a single settoxic operation. + * May be called without the page locked. + */ +void +page_settoxic(page_t *pp, uchar_t bits) +{ + atomic_or_8(&pp->p_toxic, bits); +} + +/* + * Note that multiple bits may cleared in a single clrtoxic operation. + * Must be called with the page exclusively locked. + */ +void +page_clrtoxic(page_t *pp, uchar_t bits) +{ + ASSERT(PAGE_EXCL(pp)); + atomic_and_8(&pp->p_toxic, ~bits); +} + +/* + * Prints any page retire messages to the user, and decides what + * error code is appropriate for the condition reported. + */ +static int +page_retire_done(page_t *pp, int code) +{ + page_retire_op_t *prop; + uint64_t pa = 0; + int i; + + if (pp != NULL) { + pa = mmu_ptob(pp->p_pagenum); + } + + prop = NULL; + for (i = 0; page_retire_ops[i].pr_key != PRD_INVALID_KEY; i++) { + if (page_retire_ops[i].pr_key == code) { + prop = &page_retire_ops[i]; + break; + } + } + +#ifdef DEBUG + if (page_retire_ops[i].pr_key == PRD_INVALID_KEY) { + cmn_err(CE_PANIC, "page_retire_done: Invalid opcode %d", code); + } +#endif + + ASSERT(prop->pr_key == code); + + prop->pr_count++; + + PR_MESSAGE(CE_NOTE, prop->pr_msglvl, prop->pr_message, pa); + if (pp != NULL) { + page_settoxic(pp, PR_MSG); + } + + return (prop->pr_retval); +} + +/* + * On a reboot, our friend mdboot() wants to clear up any PP_PR_REQ() pages + * that we were not able to retire. On large machines, walking the complete + * page_t array and looking at every page_t takes too long. So, as a page is + * marked toxic, we track it using a list that can be processed at reboot + * time. 
page_retire_enqueue() will do its best to try to avoid duplicate + * entries, but if we get too many errors at once the queue can overflow, + * in which case we will end up walking every page_t as a last resort. + * The background thread also makes use of this queue to find which pages + * are pending retirement. + */ +static void +page_retire_enqueue(page_t *pp) +{ + int nslot = -1; + int i; + + mutex_enter(&pr_q_mutex); + + /* + * Check to make sure retire hasn't already dequeued it. + * In the meantime if the page was cleaned up, no need + * to enqueue it. + */ + if (PP_RETIRED(pp) || pp->p_toxic == 0) { + mutex_exit(&pr_q_mutex); + PR_DEBUG(prd_noaction); + return; + } + + for (i = 0; i < PR_PENDING_QMAX; i++) { + if (pr_pending_q[i] == pp) { + mutex_exit(&pr_q_mutex); + PR_DEBUG(prd_dup); + return; + } else if (nslot == -1 && pr_pending_q[i] == NULL) { + nslot = i; + } + } + + PR_INCR_KSTAT(pr_pending); + + if (nslot != -1) { + pr_pending_q[nslot] = pp; + PR_DEBUG(prd_queued); + } else { + PR_INCR_KSTAT(pr_enqueue_fail); + PR_DEBUG(prd_notqueued); + } + mutex_exit(&pr_q_mutex); +} + +static void +page_retire_dequeue(page_t *pp) +{ + int i; + + mutex_enter(&pr_q_mutex); + + for (i = 0; i < PR_PENDING_QMAX; i++) { + if (pr_pending_q[i] == pp) { + pr_pending_q[i] = NULL; + break; + } + } + + if (i == PR_PENDING_QMAX) { + PR_INCR_KSTAT(pr_dequeue_fail); + } + + PR_DECR_KSTAT(pr_pending); + PR_DEBUG(prd_dequeue); + + mutex_exit(&pr_q_mutex); +} + +/* + * Act like page_destroy(), but instead of freeing the page, hash it onto + * the retired_pages vnode, and mark it retired. + * + * For fun, we try to scrub the page until it's squeaky clean. + * availrmem is adjusted here. + */ +static void +page_retire_destroy(page_t *pp) +{ + ASSERT(PAGE_EXCL(pp)); + ASSERT(!PP_ISFREE(pp)); + ASSERT(pp->p_szc == 0); + ASSERT(!hat_page_is_mapped(pp)); + ASSERT(!pp->p_vnode); + + page_clr_all_props(pp); + pagescrub(pp, 0, MMU_PAGESIZE); + + pp->p_next = NULL; + pp->p_prev = NULL; + if (page_hashin(pp, retired_pages, (u_offset_t)pp, NULL) == 0) { + cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp); + } + + page_settoxic(pp, PR_RETIRED); + page_clrtoxic(pp, PR_BUSY); + page_retire_dequeue(pp); + PR_INCR_KSTAT(pr_retired); + + if (pp->p_toxic & PR_FMA) { + PR_INCR_KSTAT(pr_fma); + } else if (pp->p_toxic & PR_UE) { + PR_INCR_KSTAT(pr_ue); + } else { + PR_INCR_KSTAT(pr_mce); + } + + mutex_enter(&freemem_lock); + availrmem--; + mutex_exit(&freemem_lock); + + page_unlock(pp); +} + +/* + * Check whether the number of pages which have been retired already exceeds + * the maximum allowable percentage of memory which may be retired. + * + * Returns 1 if the limit has been exceeded. + */ +static int +page_retire_limit(void) +{ + if (PR_KSTAT_RETIRED_NOTUE >= (uint64_t)PAGE_RETIRE_LIMIT) { + PR_INCR_KSTAT(pr_limit_exceeded); + return (1); + } + + return (0); +} + +#define MSG_DM "Data Mismatch occurred at PA 0x%08x.%08x" \ + "[ 0x%x != 0x%x ] while attempting to clear previously " \ + "reported error; page removed from service" + +#define MSG_UE "Uncorrectable Error occurred at PA 0x%08x.%08x while " \ + "attempting to clear previously reported error; page removed " \ + "from service" + +/* + * Attempt to clear a UE from a page. + * Returns 1 if the error has been successfully cleared. 
+ */ +static int +page_clear_transient_ue(page_t *pp) +{ + caddr_t kaddr; + uint8_t rb, wb; + uint64_t pa; + uint32_t pa_hi, pa_lo; + on_trap_data_t otd; + int errors = 0; + int i; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(PP_PR_REQ(pp)); + ASSERT(pp->p_szc == 0); + ASSERT(!hat_page_is_mapped(pp)); + + /* + * Clear the page and attempt to clear the UE. If we trap + * on the next access to the page, we know the UE has recurred. + */ + pagescrub(pp, 0, PAGESIZE); + + /* + * Map the page and write a bunch of bit patterns to compare + * what we wrote with what we read back. This isn't a perfect + * test but it should be good enough to catch most of the + * recurring UEs. If this fails to catch a recurrent UE, we'll + * retire the page the next time we see a UE on the page. + */ + kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1); + + pa = ptob((uint64_t)page_pptonum(pp)); + pa_hi = (uint32_t)(pa >> 32); + pa_lo = (uint32_t)pa; + + /* + * Fill the page with each (0x00 - 0xFF] bit pattern, flushing + * the cache in between reading and writing. We do this under + * on_trap() protection to avoid recursion. + */ + if (on_trap(&otd, OT_DATA_EC)) { + PR_MESSAGE(CE_WARN, 1, MSG_UE, pa); + errors = 1; + } else { + for (wb = 0xff; wb > 0; wb--) { + for (i = 0; i < PAGESIZE; i++) { + kaddr[i] = wb; + } + + sync_data_memory(kaddr, PAGESIZE); + + for (i = 0; i < PAGESIZE; i++) { + rb = kaddr[i]; + if (rb != wb) { + /* + * We had a mismatch without a trap. + * Uh-oh. Something is really wrong + * with this system. + */ + if (page_retire_messages) { + cmn_err(CE_WARN, MSG_DM, + pa_hi, pa_lo, rb, wb); + } + errors = 1; + goto out; /* double break */ + } + } + } + } +out: + no_trap(); + ppmapout(kaddr); + + return (errors ? 0 : 1); +} + +/* + * Try to clear a page_t with a single UE. If the UE was transient, it is + * returned to service, and we return 1. Otherwise we return 0 meaning + * that further processing is required to retire the page. + */ +static int +page_retire_transient_ue(page_t *pp) +{ + ASSERT(PAGE_EXCL(pp)); + ASSERT(!hat_page_is_mapped(pp)); + + /* + * If this page is a repeat offender, retire him under the + * "two strikes and you're out" rule. The caller is responsible + * for scrubbing the page to try to clear the error. + */ + if (pp->p_toxic & PR_UE_SCRUBBED) { + PR_INCR_KSTAT(pr_ue_persistent); + return (0); + } + + if (page_clear_transient_ue(pp)) { + /* + * We set the PR_SCRUBBED_UE bit; if we ever see this + * page again, we will retire it, no questions asked. + */ + page_settoxic(pp, PR_UE_SCRUBBED); + + if (page_retire_first_ue) { + PR_INCR_KSTAT(pr_ue_cleared_retire); + return (0); + } else { + PR_INCR_KSTAT(pr_ue_cleared_free); + + page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG | PR_BUSY); + page_retire_dequeue(pp); + + /* + * Clear the free bit if it's set, since the + * page free code will get cranky if we don't. + */ + PP_CLRFREE(pp); + + /* LINTED: CONSTCOND */ + VN_DISPOSE(pp, B_FREE, 1, kcred); + return (1); + } + } + + PR_INCR_KSTAT(pr_ue_persistent); + return (0); +} + +/* + * Update the statistics dynamically when our kstat is read. 
+ */ +static int +page_retire_kstat_update(kstat_t *ksp, int rw) +{ + struct page_retire_kstat *pr; + + if (ksp == NULL) + return (EINVAL); + + switch (rw) { + + case KSTAT_READ: + pr = (struct page_retire_kstat *)ksp->ks_data; + ASSERT(pr == &page_retire_kstat); + pr->pr_limit.value.ui64 = PAGE_RETIRE_LIMIT; + return (0); + + case KSTAT_WRITE: + return (EACCES); + + default: + return (EINVAL); + } + /*NOTREACHED*/ +} + +/* + * Initialize the page retire mechanism: + * + * - Establish the correctable error retire limit. + * - Initialize locks. + * - Build the retired_pages vnode. + * - Set up the kstats. + * - Fire off the background thread. + * - Tell page_tryretire() it's OK to start retiring pages. + */ +void +page_retire_init(void) +{ + const fs_operation_def_t retired_vnodeops_template[] = {NULL, NULL}; + struct vnodeops *vops; + + const uint_t page_retire_ndata = + sizeof (page_retire_kstat) / sizeof (kstat_named_t); + + ASSERT(page_retire_ksp == NULL); + + if (max_pages_retired_bps <= 0) { + max_pages_retired_bps = MCE_BPT; + } + + mutex_init(&pr_q_mutex, NULL, MUTEX_DEFAULT, NULL); + + retired_pages = vn_alloc(KM_SLEEP); + if (vn_make_ops("retired_pages", retired_vnodeops_template, &vops)) { + cmn_err(CE_PANIC, + "page_retired_init: can't make retired vnodeops"); + } + vn_setops(retired_pages, vops); + + if ((page_retire_ksp = kstat_create("unix", 0, "page_retire", + "misc", KSTAT_TYPE_NAMED, page_retire_ndata, + KSTAT_FLAG_VIRTUAL)) == NULL) { + cmn_err(CE_WARN, "kstat_create for page_retire failed"); + } else { + page_retire_ksp->ks_data = (void *)&page_retire_kstat; + page_retire_ksp->ks_update = page_retire_kstat_update; + kstat_install(page_retire_ksp); + } + + pr_thread_shortwait = 23 * hz; + pr_thread_longwait = 1201 * hz; + mutex_init(&pr_thread_mutex, NULL, MUTEX_DEFAULT, NULL); + cv_init(&pr_cv, NULL, CV_DEFAULT, NULL); + pr_thread_id = thread_create(NULL, 0, page_retire_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); + + pr_enable = 1; +} + +/* + * page_retire_hunt() callback for the retire thread. + */ +static void +page_retire_thread_cb(page_t *pp) +{ + PR_DEBUG(prd_tctop); + if (pp->p_vnode != &kvp && page_trylock(pp, SE_EXCL)) { + PR_DEBUG(prd_tclocked); + page_unlock(pp); + } +} + +/* + * page_retire_hunt() callback for mdboot(). + * + * It is necessary to scrub any failing pages prior to reboot in order to + * prevent a latent error trap from occurring on the next boot. + */ +void +page_retire_mdboot_cb(page_t *pp) +{ + /* + * Don't scrub the kernel, since we might still need it, unless + * we have UEs on the page, in which case we have nothing to lose. + */ + if (pp->p_vnode != &kvp || PP_TOXIC(pp)) { + pp->p_selock = -1; /* pacify ASSERTs */ + pagescrub(pp, 0, PAGESIZE); + pp->p_selock = 0; + } + pp->p_toxic = 0; +} + +/* + * Hunt down any pages in the system that have not yet been retired, invoking + * the provided callback function on each of them. 
+ */ +void +page_retire_hunt(void (*callback)(page_t *)) +{ + page_t *pp; + page_t *first; + int i, found; + + PR_DEBUG(prd_hunt); + + if (PR_KSTAT_PENDING == 0) { + return; + } + + PR_DEBUG(prd_dohunt); + + found = 0; + mutex_enter(&pr_q_mutex); + + for (i = 0; i < PR_PENDING_QMAX; i++) { + if ((pp = pr_pending_q[i]) != NULL) { + mutex_exit(&pr_q_mutex); + callback(pp); + mutex_enter(&pr_q_mutex); + found++; + } + } + + if (PR_KSTAT_EQFAIL == PR_KSTAT_DQFAIL && found == PR_KSTAT_PENDING) { + mutex_exit(&pr_q_mutex); + PR_DEBUG(prd_earlyhunt); + return; + } + mutex_exit(&pr_q_mutex); + + PR_DEBUG(prd_latehunt); + + /* + * We've lost track of a page somewhere. Hunt it down. + */ + memsegs_lock(0); + pp = first = page_first(); + do { + if (PP_PR_REQ(pp)) { + callback(pp); + if (++found == PR_KSTAT_PENDING) { + break; /* got 'em all */ + } + } + } while ((pp = page_next(pp)) != first); + memsegs_unlock(0); +} + +/* + * The page_retire_thread loops forever, looking to see if there are + * pages still waiting to be retired. + */ +static void +page_retire_thread(void) +{ + callb_cpr_t c; + + CALLB_CPR_INIT(&c, &pr_thread_mutex, callb_generic_cpr, "page_retire"); + + mutex_enter(&pr_thread_mutex); + for (;;) { + if (pr_enable && PR_KSTAT_PENDING) { + kmem_reap(); + seg_preap(); + page_retire_hunt(page_retire_thread_cb); + CALLB_CPR_SAFE_BEGIN(&c); + (void) cv_timedwait(&pr_cv, &pr_thread_mutex, + lbolt + pr_thread_shortwait); + CALLB_CPR_SAFE_END(&c, &pr_thread_mutex); + } else { + CALLB_CPR_SAFE_BEGIN(&c); + (void) cv_timedwait(&pr_cv, &pr_thread_mutex, + lbolt + pr_thread_longwait); + CALLB_CPR_SAFE_END(&c, &pr_thread_mutex); + } + } + /*NOTREACHED*/ +} + +/* + * page_retire_pp() decides what to do with a failing page. + * + * When we get a free page (e.g. the scrubber or in the free path) life is + * nice because the page is clean and marked free -- those always retire + * nicely. From there we go by order of difficulty. If the page has data, + * we attempt to relocate its contents to a suitable replacement page. If + * that does not succeed, we look to see if it is clean. If after all of + * this we have a clean, unmapped page (which we usually do!), we retire it. + * If the page is not clean, we still process it regardless on a UE; for + * CEs or FMA requests, we fail leaving the page in service. The page will + * eventually be tried again later. We always return with the page unlocked + * since we are called from page_unlock(). + * + * We don't call panic or do anything fancy down in here. Our boss the DE + * gets paid handsomely to do his job of figuring out what to do when errors + * occur. We just do what he tells us to do. 
+ */ +static int +page_retire_pp(page_t *pp) +{ + int toxic; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_iolock_state == 0); + ASSERT(pp->p_szc == 0); + + PR_DEBUG(prd_top); + PR_TYPES(pp); + + toxic = pp->p_toxic; + ASSERT(toxic & PR_REASONS); + + if ((toxic & (PR_FMA | PR_MCE)) && !(toxic & PR_UE) && + page_retire_limit()) { + page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY); + page_retire_dequeue(pp); + page_unlock(pp); + return (page_retire_done(pp, PRD_LIMIT)); + } + + if (PP_ISFREE(pp)) { + PR_DEBUG(prd_free); + if (!MTBF(recl_calls, recl_mtbf) || !page_reclaim(pp, NULL)) { + PR_DEBUG(prd_noreclaim); + PR_INCR_KSTAT(pr_failed); + page_unlock(pp); + return (page_retire_done(pp, PRD_FAILED)); + } + } + + if ((toxic & PR_UE) == 0 && pp->p_vnode && !PP_ISFREE(pp) && + !PP_ISNORELOC(pp) && MTBF(reloc_calls, reloc_mtbf)) { + page_t *newpp; + spgcnt_t count; + + /* + * If we can relocate the page, great! newpp will go + * on without us, and everything is fine. Regardless + * of whether the relocation succeeds, we are still + * going to take `pp' around back and shoot it. + */ + PR_DEBUG(prd_reloc); + newpp = NULL; + if (page_relocate(&pp, &newpp, 0, 0, &count, NULL) == 0) { + page_unlock(newpp); + ASSERT(hat_page_getattr(pp, P_MOD) == 0); + } + } + + if (pp->p_vnode == &kvp) { + PR_DEBUG(prd_kern); + PR_INCR_KSTAT(pr_failed_kernel); + page_unlock(pp); + return (page_retire_done(pp, PRD_FAILED)); + } + + if (pp->p_lckcnt || pp->p_cowcnt) { + if (toxic & PR_UE) { + (void) page_clear_lck_cow(pp, 1); + } else { + PR_DEBUG(prd_locked); + PR_INCR_KSTAT(pr_failed); + page_unlock(pp); + return (page_retire_done(pp, PRD_FAILED)); + } + } + + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + ASSERT(!PP_ISFREE(pp)); + ASSERT(!hat_page_is_mapped(pp)); + + /* + * If the page is modified, was not relocated, and not toxic, + * we can't retire it without dropping data on the floor. + * + * RFE: we could change fsflush so that it (and only it) will + * be allowed to lock this page and push it out. Once it cleans + * the page, we'd then be able to retire it on the free path. + * In practice, this should be exceedingly rare. + */ + if (hat_ismod(pp)) { + if ((toxic & PR_UE) == 0) { + PR_DEBUG(prd_modce); + PR_INCR_KSTAT(pr_failed); + page_unlock(pp); + return (page_retire_done(pp, PRD_FAILED)); + } else if (page_retire_modified == 0) { + PR_DEBUG(prd_modue_fail); + PR_INCR_KSTAT(pr_failed); + page_unlock(pp); + return (page_retire_done(pp, PRD_FAILED)); + } + PR_DEBUG(prd_modue_retire); + } + + if (pp->p_vnode) { + PR_DEBUG(prd_hashout); + page_hashout(pp, NULL); + } + ASSERT(!pp->p_vnode); + + /* + * The problem page is locked, demoted, unmapped, not free, + * hashed out, and not COW or mlocked (whew!). + * + * Now we select our ammunition, take it around back, and shoot it. + */ + if (toxic & PR_UE) { + if (hat_ismod(pp)) { + /* + * Let the user know we are dropping their data + * on the floor. 
+			 */
+			PR_MESSAGE(CE_WARN, 1, "Removing modified page "
+			    "0x%08x.%08x from service",
+			    mmu_ptob(pp->p_pagenum));
+		}
+		if (page_retire_transient_ue(pp)) {
+			PR_DEBUG(prd_uescrubbed);
+			return (page_retire_done(pp, PRD_UE_SCRUBBED));
+		} else {
+			PR_DEBUG(prd_uenotscrubbed);
+			page_retire_destroy(pp);
+			return (page_retire_done(pp, PRD_SUCCESS));
+		}
+	} else if (toxic & PR_FMA) {
+		PR_DEBUG(prd_fma);
+		page_retire_destroy(pp);
+		return (page_retire_done(pp, PRD_SUCCESS));
+	} else if (toxic & PR_MCE) {
+		PR_DEBUG(prd_mce);
+		page_retire_destroy(pp);
+		return (page_retire_done(pp, PRD_SUCCESS));
+	}
+	panic("page_retire_pp: bad toxic flags %d", toxic);
+	/*NOTREACHED*/
+}
+
+/*
+ * Try to retire a page when we stumble onto it in the page lock routines.
+ */
+void
+page_tryretire(page_t *pp)
+{
+	ASSERT(PAGE_EXCL(pp));
+
+	if (!pr_enable) {
+		page_unlock(pp);
+		return;
+	}
+
+	/*
+	 * If the page is a big page, try to break it up.
+	 *
+	 * If there are other bad pages besides `pp', they will be
+	 * recursively retired for us thanks to a bit of magic.
+	 * If the page is a small page with errors, try to retire it.
+	 */
+	if (pp->p_szc > 0) {
+		if (PP_ISFREE(pp) && !page_try_demote_free_pages(pp)) {
+			page_unlock(pp);
+			PR_DEBUG(prd_nofreedemote);
+			return;
+		} else if (!page_try_demote_pages(pp)) {
+			page_unlock(pp);
+			PR_DEBUG(prd_nodemote);
+			return;
+		}
+		PR_DEBUG(prd_demoted);
+		page_unlock(pp);
+	} else {
+		(void) page_retire_pp(pp);
+	}
+}
+
+/*
+ * page_retire() - the front door for retiring a page.
+ *
+ * Ideally, page_retire() would instantly retire the requested page.
+ * Unfortunately, some pages are locked or otherwise tied up and cannot be
+ * retired right away. To deal with that, bits are set in p_toxic of the
+ * page_t. An attempt is made to lock the page; if the attempt is
+ * successful, we instantly unlock the page, counting on page_unlock() to
+ * notice p_toxic is nonzero and to call back into page_retire_pp().
+ * Success is determined by looking to see whether the page has been
+ * retired once it has been unlocked.
+ *
+ * Returns:
+ *
+ *   - 0 on success,
+ *   - EINVAL when the PA is whacko,
+ *   - EBUSY if the page is already retired, or
+ *   - EAGAIN if the page could not be _immediately_ retired.
+ */
+int
+page_retire(uint64_t pa, uchar_t reason)
+{
+	page_t	*pp;
+
+	ASSERT(reason & PR_REASONS);		/* there must be a reason */
+	ASSERT(!(reason & ~PR_REASONS));	/* but no other bits */
+
+	pp = page_numtopp_nolock(mmu_btop(pa));
+	if (pp == NULL) {
+		PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
+		    " page 0x%08x.%08x; page is not relocatable memory", pa);
+		return (page_retire_done(pp, PRD_INVALID_PA));
+	}
+	if (PP_RETIRED(pp)) {
+		return (page_retire_done(pp, PRD_DUPLICATE));
+	}
+
+	if (reason & PR_UE) {
+		PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
+		    " page 0x%08x.%08x", pa);
+	} else {
+		PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
+		    " page 0x%08x.%08x", pa);
+	}
+	page_settoxic(pp, reason);
+	page_retire_enqueue(pp);
+
+	/*
+	 * And now for some magic.
+	 *
+	 * We marked this page toxic up above. All there is left to do is
+	 * to try to lock the page and then unlock it. The page lock routines
+	 * will intercept the page and retire it if they can. If the page
+	 * cannot be locked, that's okay -- page_unlock() or the background
+	 * thread will eventually get it; until then, the lock routines will
+	 * deny further locks on it.
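+	 *
+	 * (The MTBF() test below is a fault-injection hook used by the
+	 * page retirement test code; on a non-DEBUG kernel it always
+	 * passes.)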
+ */ + if (MTBF(pr_calls, pr_mtbf) && page_trylock(pp, SE_EXCL)) { + PR_DEBUG(prd_prlocked); + page_unlock(pp); + } else { + PR_DEBUG(prd_prnotlocked); + } + + if (PP_RETIRED(pp)) { + PR_DEBUG(prd_prretired); + return (0); + } else { + cv_signal(&pr_cv); + PR_INCR_KSTAT(pr_failed); + + if (pp->p_toxic & PR_MSG) { + return (page_retire_done(pp, PRD_FAILED)); + } else { + return (page_retire_done(pp, PRD_PENDING)); + } + } +} + +/* + * Take a retired page off the retired-pages vnode and clear the toxic flags. + * If "free" is nonzero, lock it and put it back on the freelist. If "free" + * is zero, the caller already holds SE_EXCL lock so we simply unretire it + * and don't do anything else with it. + * + * Any unretire messages are printed from this routine. + * + * Returns 0 if page pp was unretired; else an error code. + */ +int +page_unretire_pp(page_t *pp, int free) +{ + /* + * To be retired, a page has to be hashed onto the retired_pages vnode + * and have PR_RETIRED set in p_toxic. + */ + if (free == 0 || page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) { + ASSERT(PAGE_EXCL(pp)); + PR_DEBUG(prd_ulocked); + if (!PP_RETIRED(pp)) { + PR_DEBUG(prd_unotretired); + page_unlock(pp); + return (page_retire_done(pp, PRD_UNR_NOT)); + } + + PR_MESSAGE(CE_NOTE, 1, "unretiring retired" + " page 0x%08x.%08x", mmu_ptob(pp->p_pagenum)); + if (pp->p_toxic & PR_FMA) { + PR_DECR_KSTAT(pr_fma); + } else if (pp->p_toxic & PR_UE) { + PR_DECR_KSTAT(pr_ue); + } else { + PR_DECR_KSTAT(pr_mce); + } + page_clrtoxic(pp, PR_ALLFLAGS); + + if (free) { + PR_DEBUG(prd_udestroy); + page_destroy(pp, 0); + } else { + PR_DEBUG(prd_uhashout); + page_hashout(pp, NULL); + } + + mutex_enter(&freemem_lock); + availrmem++; + mutex_exit(&freemem_lock); + + PR_DEBUG(prd_uunretired); + PR_DECR_KSTAT(pr_retired); + PR_INCR_KSTAT(pr_unretired); + return (page_retire_done(pp, PRD_UNR_SUCCESS)); + } + PR_DEBUG(prd_unotlocked); + return (page_retire_done(pp, PRD_UNR_CANTLOCK)); +} + +/* + * Return a page to service by moving it from the retired_pages vnode + * onto the freelist. + * + * Called from mmioctl_page_retire() on behalf of the FMA DE. + * + * Returns: + * + * - 0 if the page is unretired, + * - EAGAIN if the pp can not be locked, + * - EINVAL if the PA is whacko, and + * - EBADF if the pp is not retired. + */ +int +page_unretire(uint64_t pa) +{ + page_t *pp; + + pp = page_numtopp_nolock(mmu_btop(pa)); + if (pp == NULL) { + return (page_retire_done(pp, PRD_INVALID_PA)); + } + + return (page_unretire_pp(pp, 1)); +} + +/* + * Test a page to see if it is retired. If errors is non-NULL, the toxic + * bits of the page are returned. Returns 0 on success, error code on failure. + */ +int +page_retire_check_pp(page_t *pp, uint64_t *errors) +{ + int rc; + + if (PP_RETIRED(pp)) { + PR_DEBUG(prd_checkhit); + rc = 0; + } else { + PR_DEBUG(prd_checkmiss); + rc = EAGAIN; + } + + /* + * We have magically arranged the bit values returned to fmd(1M) + * to line up with the FMA, MCE, and UE bits of the page_t. + */ + if (errors) { + uint64_t toxic = (uint64_t)(pp->p_toxic & PR_ERRMASK); + if (toxic & PR_UE_SCRUBBED) { + toxic &= ~PR_UE_SCRUBBED; + toxic |= PR_UE; + } + *errors = toxic; + } + + return (rc); +} + +/* + * Test to see if the page_t for a given PA is retired, and return the + * hardware errors we have seen on the page if requested. + * + * Called from mmioctl_page_retire on behalf of the FMA DE. + * + * Returns: + * + * - 0 if the page is retired, + * - EAGAIN if it is not, and + * - EINVAL if the PA is whacko. 
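+ *
+ * When errors is non-NULL it is initialized even on failure, so the
+ * caller always sees a well-defined value. A hypothetical caller
+ * (sketch only, not part of this interface) might look like:
+ *
+ *	uint64_t flags;
+ *	if (page_retire_check(pa, &flags) == 0) {
+ *		... the page is retired; flags holds the PR_UE,
+ *		... PR_MCE and PR_FMA bits recorded for the page
+ *	}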
+ */ +int +page_retire_check(uint64_t pa, uint64_t *errors) +{ + page_t *pp; + + if (errors) { + *errors = 0; + } + + pp = page_numtopp_nolock(mmu_btop(pa)); + if (pp == NULL) { + return (page_retire_done(pp, PRD_INVALID_PA)); + } + + return (page_retire_check_pp(pp, errors)); +} + +/* + * Page retire self-test. For now, it always returns 0. + */ +int +page_retire_test(void) +{ + page_t *first, *pp, *cpp, *cpp2, *lpp; + + /* + * Tests the corner case where a large page can't be retired + * because one of the constituent pages is locked. We mark + * one page to be retired and try to retire it, and mark the + * other page to be retired but don't try to retire it, so + * that page_unlock() in the failure path will recurse and try + * to retire THAT page. This is the worst possible situation + * we can get ourselves into. + */ + memsegs_lock(0); + pp = first = page_first(); + do { + if (pp->p_szc && PP_PAGEROOT(pp) == pp) { + cpp = pp + 1; + lpp = PP_ISFREE(pp)? pp : pp + 2; + cpp2 = pp + 3; + if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED)) + continue; + if (!page_trylock(cpp, SE_EXCL)) { + page_unlock(lpp); + continue; + } + page_settoxic(cpp, PR_FMA | PR_BUSY); + page_settoxic(cpp2, PR_FMA); + page_tryretire(cpp); /* will fail */ + page_unlock(lpp); + (void) page_retire(cpp->p_pagenum, PR_FMA); + (void) page_retire(cpp2->p_pagenum, PR_FMA); + } + } while ((pp = page_next(pp)) != first); + memsegs_unlock(0); + + return (0); +} diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c index 5b3db34db1..27b2702d28 100644 --- a/usr/src/uts/common/vm/vm_page.c +++ b/usr/src/uts/common/vm/vm_page.c @@ -87,90 +87,6 @@ static pgcnt_t max_page_get; /* max page_get request size in pages */ pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */ /* - * vnode for all pages which are retired from the VM system; - * such as pages with Uncorrectable Errors. - */ -struct vnode retired_ppages; - -static void page_retired_init(void); -static void retired_dispose(vnode_t *vp, page_t *pp, int flag, - int dn, cred_t *cr); -static void retired_inactive(vnode_t *vp, cred_t *cr); -static void page_retired(page_t *pp); -static void retired_page_removed(page_t *pp); -void page_unretire_pages(void); - -/* - * The maximum number of pages that will be unretired in one iteration. - * This number is totally arbitrary. - */ -#define UNRETIRE_PAGES 256 - -/* - * We limit the number of pages that may be retired to - * a percentage of the total physical memory. Note that - * the percentage values are stored as 'basis points', - * ie, 100 basis points is 1%. - */ -#define MAX_PAGES_RETIRED_BPS_DEFAULT 10 /* .1% */ - -uint64_t max_pages_retired_bps = MAX_PAGES_RETIRED_BPS_DEFAULT; - -static int pages_retired_limit_exceeded(void); - -/* - * operations vector for vnode with retired pages. Only VOP_DISPOSE - * and VOP_INACTIVE are intercepted. 
- */ -struct vnodeops retired_vnodeops = { - "retired_vnodeops", - fs_nosys, /* open */ - fs_nosys, /* close */ - fs_nosys, /* read */ - fs_nosys, /* write */ - fs_nosys, /* ioctl */ - fs_nosys, /* setfl */ - fs_nosys, /* getattr */ - fs_nosys, /* setattr */ - fs_nosys, /* access */ - fs_nosys, /* lookup */ - fs_nosys, /* create */ - fs_nosys, /* remove */ - fs_nosys, /* link */ - fs_nosys, /* rename */ - fs_nosys, /* mkdir */ - fs_nosys, /* rmdir */ - fs_nosys, /* readdir */ - fs_nosys, /* symlink */ - fs_nosys, /* readlink */ - fs_nosys, /* fsync */ - retired_inactive, - fs_nosys, /* fid */ - fs_rwlock, /* rwlock */ - fs_rwunlock, /* rwunlock */ - fs_nosys, /* seek */ - fs_nosys, /* cmp */ - fs_nosys, /* frlock */ - fs_nosys, /* space */ - fs_nosys, /* realvp */ - fs_nosys, /* getpage */ - fs_nosys, /* putpage */ - fs_nosys_map, - fs_nosys_addmap, - fs_nosys, /* delmap */ - fs_nosys_poll, - fs_nosys, /* dump */ - fs_nosys, /* l_pathconf */ - fs_nosys, /* pageio */ - fs_nosys, /* dumpctl */ - retired_dispose, - fs_nosys, /* setsecattr */ - fs_nosys, /* getsecatt */ - fs_nosys, /* shrlock */ - fs_vnevent_nosupport /* vnevent */ -}; - -/* * freemem_lock protects all freemem variables: * availrmem. Also this lock protects the globals which track the * availrmem changes for accurate kernel footprint calculation. @@ -289,15 +205,6 @@ static kcondvar_t pcgs_cv; /* cv for delay in pcgs */ #define PAGE_LOCK_MAXIMUM \ ((1 << (sizeof (((page_t *)0)->p_lckcnt) * NBBY)) - 1) -/* - * Control over the verbosity of page retirement. When set to zero, no messages - * will be printed. A value of one will trigger messages for retirement - * operations, and is intended for processors which don't yet support FMA - * (spitfire). Two will cause verbose messages to be printed when retirements - * complete, and is intended only for debugging purposes. - */ -int page_retire_messages = 0; - #ifdef VM_STATS /* @@ -440,11 +347,7 @@ vm_init(void) (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm"); page_init_mem_config(); - - /* - * initialise the vnode for retired pages - */ - page_retired_init(); + page_retire_init(); } /* @@ -2799,153 +2702,6 @@ page_free(page_t *pp, int dontneed) ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); - if (page_deteriorating(pp)) { - volatile int i = 0; - char *kaddr; - volatile int rb, wb; - uint64_t pa; - volatile int ue = 0; - on_trap_data_t otd; - - if (pp->p_vnode != NULL) { - /* - * Let page_destroy() do its bean counting and - * hash out the page; it will then call back - * into page_free() with pp->p_vnode == NULL. - */ - page_destroy(pp, 0); - return; - } - - if (page_isfailing(pp)) { - /* - * If we have already exceeded the limit for - * pages retired, we will treat this page as - * 'toxic' rather than failing. That will ensure - * that the page is at least cleaned, and if - * a UE is detected, the page will be retired - * anyway. - */ - if (pages_retired_limit_exceeded()) { - /* - * clear the flag and reset to toxic - */ - page_clrtoxic(pp); - page_settoxic(pp, PAGE_IS_TOXIC); - } else { - pa = ptob((uint64_t)page_pptonum(pp)); - if (page_retire_messages) { - cmn_err(CE_NOTE, "Page 0x%08x.%08x " - "removed from service", - (uint32_t)(pa >> 32), (uint32_t)pa); - } - goto page_failed; - } - } - - pagescrub(pp, 0, PAGESIZE); - - /* - * We want to determine whether the error that occurred on - * this page is transient or persistent, so we get a mapping - * to the page and try every possible bit pattern to compare - * what we write with what we read back. 
A smaller number - * of bit patterns might suffice, but there's no point in - * getting fancy. If this is the hot path on your system, - * you've got bigger problems. - */ - kaddr = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1); - for (wb = 0xff; wb >= 0; wb--) { - if (on_trap(&otd, OT_DATA_EC)) { - pa = ptob((uint64_t)page_pptonum(pp)) + i; - page_settoxic(pp, PAGE_IS_FAILING); - - if (page_retire_messages) { - cmn_err(CE_WARN, "Uncorrectable Error " - "occurred at PA 0x%08x.%08x while " - "attempting to clear previously " - "reported error; page removed from " - "service", (uint32_t)(pa >> 32), - (uint32_t)pa); - } - - ue++; - break; - } - - /* - * Write out the bit pattern, flush it to memory, and - * read it back while under on_trap() protection. - */ - for (i = 0; i < PAGESIZE; i++) - kaddr[i] = wb; - - sync_data_memory(kaddr, PAGESIZE); - - for (i = 0; i < PAGESIZE; i++) { - if ((rb = (uchar_t)kaddr[i]) != wb) { - page_settoxic(pp, PAGE_IS_FAILING); - goto out; - } - } - } -out: - no_trap(); - ppmapout(kaddr); - - if (wb >= 0 && !ue) { - pa = ptob((uint64_t)page_pptonum(pp)) + i; - if (page_retire_messages) { - cmn_err(CE_WARN, "Data Mismatch occurred at PA " - "0x%08x.%08x [ 0x%x != 0x%x ] while " - "attempting to clear previously reported " - "error; page removed from service", - (uint32_t)(pa >> 32), (uint32_t)pa, rb, wb); - } - } -page_failed: - /* - * DR operations change the association between a page_t - * and the physical page it represents. Check if the - * page is still bad. If it is, then retire it. - */ - if (page_isfaulty(pp) && page_isfailing(pp)) { - /* - * In the future, it might be useful to have a platform - * callback here to tell the hardware to fence off this - * page during the next reboot. - * - * We move the page to the retired_vnode here - */ - (void) page_hashin(pp, &retired_ppages, - (u_offset_t)ptob((uint64_t)page_pptonum(pp)), NULL); - mutex_enter(&freemem_lock); - availrmem--; - mutex_exit(&freemem_lock); - page_retired(pp); - page_downgrade(pp); - - /* - * If DR raced with the above page retirement code, - * we might have retired a good page. If so, unretire - * the page. - */ - if (!page_isfaulty(pp)) - page_unretire_pages(); - return; - } - - pa = ptob((uint64_t)page_pptonum(pp)); - - if (page_retire_messages) { - cmn_err(CE_NOTE, "Previously reported error on page " - "0x%08x.%08x cleared", (uint32_t)(pa >> 32), - (uint32_t)pa); - } - - page_clrtoxic(pp); - } - if (PP_ISFREE(pp)) { panic("page_free: page %p is free", (void *)pp); } @@ -3089,7 +2845,6 @@ page_free_pages(page_t *pp) pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); pgcnt_t i; uint_t szc = pp->p_szc; - int toxic = 0; VM_STAT_ADD(pagecnt.pc_free_pages); TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, @@ -3118,9 +2873,6 @@ page_free_pages(page_t *pp) ASSERT(tpp->p_vnode == NULL); ASSERT(tpp->p_szc == szc); - if (page_deteriorating(tpp)) - toxic = 1; - PP_SETFREE(tpp); page_clr_all_props(tpp); PP_SETAGED(tpp); @@ -3131,10 +2883,6 @@ page_free_pages(page_t *pp) } ASSERT(rootpp == pp); - if (toxic) { - page_free_toxic_pages(rootpp); - return; - } page_list_add_pages(rootpp, 0); page_create_putback(pgcnt); } @@ -3219,12 +2967,13 @@ page_reclaim(page_t *pp, kmutex_t *lock) struct pcf *p; uint_t pcf_index; struct cpu *cpup; - int enough; uint_t i; + pgcnt_t npgs, need, collected; ASSERT(lock != NULL ? 
MUTEX_HELD(lock) : 1); ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp)); - ASSERT(pp->p_szc == 0); + + npgs = page_get_pagecnt(pp->p_szc); /* * If `freemem' is 0, we cannot reclaim this page from the @@ -3254,18 +3003,19 @@ page_reclaim(page_t *pp, kmutex_t *lock) goto page_reclaim_nomem; } - enough = 0; + collected = 0; pcf_index = PCF_INDEX(); p = &pcf[pcf_index]; p->pcf_touch = 1; mutex_enter(&p->pcf_lock); - if (p->pcf_count >= 1) { - enough = 1; - p->pcf_count--; + if (p->pcf_count >= npgs) { + collected = npgs; + p->pcf_count -= npgs; } mutex_exit(&p->pcf_lock); + need = npgs - collected; - if (!enough) { + if (need > 0) { VM_STAT_ADD(page_reclaim_zero); /* * Check again. Its possible that some other thread @@ -3277,15 +3027,22 @@ page_reclaim(page_t *pp, kmutex_t *lock) for (i = 0; i < PCF_FANOUT; i++) { p->pcf_touch = 1; mutex_enter(&p->pcf_lock); - if (p->pcf_count >= 1) { - p->pcf_count -= 1; - enough = 1; - break; + if (p->pcf_count) { + if (p->pcf_count >= need) { + p->pcf_count -= need; + collected += need; + need = 0; + break; + } else if (p->pcf_count) { + collected += p->pcf_count; + need -= p->pcf_count; + p->pcf_count = 0; + } } p++; } - if (!enough) { + if (need > 0) { page_reclaim_nomem: /* * We really can't have page `pp'. @@ -3309,6 +3066,7 @@ page_reclaim_nomem: mutex_enter(&new_freemem_lock); p = pcf; + p->pcf_count += collected; for (i = 0; i < PCF_FANOUT; i++) { p->pcf_wait++; mutex_exit(&p->pcf_lock); @@ -3328,11 +3086,13 @@ page_reclaim_nomem: } /* - * There was a page to be found. + * We beat the PCF bins over the head until + * we got the memory that we wanted. * The pcf accounting has been done, * though none of the pcf_wait flags have been set, * drop the locks and continue on. */ + ASSERT(collected == npgs); while (p >= pcf) { mutex_exit(&p->pcf_lock); p--; @@ -3343,14 +3103,19 @@ page_reclaim_nomem: * freemem is not protected by any lock. Thus, we cannot * have any assertion containing freemem here. */ - freemem -= 1; + freemem -= npgs; VM_STAT_ADD(pagecnt.pc_reclaim); if (PP_ISAGED(pp)) { - page_list_sub(pp, PG_FREE_LIST); + if (npgs > 1) { + page_list_sub_pages(pp, pp->p_szc); + } else { + page_list_sub(pp, PG_FREE_LIST); + } TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE, "page_reclaim_free:pp %p", pp); } else { + ASSERT(npgs == 1); page_list_sub(pp, PG_CACHE_LIST); TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE, "page_reclaim_cache:pp %p", pp); @@ -3363,9 +3128,11 @@ page_reclaim_nomem: * * Set the reference bit to protect against immediate pageout. 
*/ - PP_CLRFREE(pp); - PP_CLRAGED(pp); - page_set_props(pp, P_REF); + for (i = 0; i < npgs; i++, pp = page_next(pp)) { + PP_CLRFREE(pp); + PP_CLRAGED(pp); + page_set_props(pp, P_REF); + } CPU_STATS_ENTER_K(); cpup = CPU; /* get cpup now that CPU cannot change */ @@ -3441,7 +3208,6 @@ page_destroy_pages(page_t *pp) pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); pgcnt_t i, pglcks = 0; uint_t szc = pp->p_szc; - int toxic = 0; ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); @@ -3471,9 +3237,6 @@ page_destroy_pages(page_t *pp) ASSERT(tpp->p_vnode == NULL); ASSERT(tpp->p_szc == szc); - if (page_deteriorating(tpp)) - toxic = 1; - PP_SETFREE(tpp); page_clr_all_props(tpp); PP_SETAGED(tpp); @@ -3489,10 +3252,6 @@ page_destroy_pages(page_t *pp) mutex_exit(&freemem_lock); } - if (toxic) { - page_free_toxic_pages(rootpp); - return; - } page_list_add_pages(rootpp, 0); page_create_putback(pgcnt); } @@ -3916,14 +3675,6 @@ page_hashout(page_t *pp, kmutex_t *phm) mutex_exit(nphm); /* - * If the page was retired, update the pages_retired - * total and clear the page flag - */ - if (page_isretired(pp)) { - retired_page_removed(pp); - } - - /* * Wake up processes waiting for this page. The page's * identity has been changed, and is probably not the * desired page any longer. @@ -5397,6 +5148,63 @@ page_release(page_t *pp, int checkmod) return (status); } +/* + * Given a constituent page, try to demote the large page on the freelist. + * + * Returns nonzero if the page could be demoted successfully. Returns with + * the constituent page still locked. + */ +int +page_try_demote_free_pages(page_t *pp) +{ + page_t *rootpp = pp; + pfn_t pfn = page_pptonum(pp); + spgcnt_t npgs; + uint_t szc = pp->p_szc; + + ASSERT(PP_ISFREE(pp)); + ASSERT(PAGE_EXCL(pp)); + + /* + * Adjust rootpp and lock it, if `pp' is not the base + * constituent page. + */ + npgs = page_get_pagecnt(pp->p_szc); + if (npgs == 1) { + return (0); + } + + if (!IS_P2ALIGNED(pfn, npgs)) { + pfn = P2ALIGN(pfn, npgs); + rootpp = page_numtopp_nolock(pfn); + } + + if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) { + return (0); + } + + if (rootpp->p_szc != szc) { + if (pp != rootpp) + page_unlock(rootpp); + return (0); + } + + page_demote_free_pages(rootpp); + + if (pp != rootpp) + page_unlock(rootpp); + + ASSERT(PP_ISFREE(pp)); + ASSERT(PAGE_EXCL(pp)); + return (1); +} + +/* + * Given a constituent page, try to demote the large page. + * + * Returns nonzero if the page could be demoted successfully. Returns with + * the constituent page still locked. + */ int page_try_demote_pages(page_t *pp) { @@ -5406,27 +5214,27 @@ page_try_demote_pages(page_t *pp) uint_t szc = pp->p_szc; vnode_t *vp = pp->p_vnode; - ASSERT(PAGE_EXCL(rootpp)); + ASSERT(PAGE_EXCL(pp)); VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]); - if (rootpp->p_szc == 0) { + if (pp->p_szc == 0) { VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]); return (1); } if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) { VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]); - page_demote_vp_pages(rootpp); + page_demote_vp_pages(pp); ASSERT(pp->p_szc == 0); return (1); } /* - * Adjust rootpp if passed in is not the base + * Adjust rootpp if passed in is not the base * constituent page. 
*/ - npgs = page_get_pagecnt(rootpp->p_szc); + npgs = page_get_pagecnt(pp->p_szc); ASSERT(npgs > 1); if (!IS_P2ALIGNED(pfn, npgs)) { pfn = P2ALIGN(pfn, npgs); @@ -5455,12 +5263,11 @@ page_try_demote_pages(page_t *pp) break; ASSERT(tpp->p_szc == rootpp->p_szc); ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i); - (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); } /* - * If we failed to lock them all then unlock what we have locked - * so far and bail. + * If we failed to lock them all then unlock what we have + * locked so far and bail. */ if (i < npgs) { tpp = rootpp; @@ -5473,12 +5280,9 @@ page_try_demote_pages(page_t *pp) return (0); } - /* - * XXX probably p_szc clearing and page unlocking can be done within - * one loop but since this is rare code we can play very safe. - */ for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) { ASSERT(PAGE_EXCL(tpp)); + (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); tpp->p_szc = 0; } @@ -5490,6 +5294,7 @@ page_try_demote_pages(page_t *pp) if (tpp != pp) page_unlock(tpp); } + VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]); return (1); } @@ -5579,221 +5384,6 @@ page_demote_vp_pages(page_t *pp) } /* - * Page retire operation. - * - * page_retire() - * Attempt to retire (throw away) page pp. We cannot do this if - * the page is dirty; if the page is clean, we can try. We return 0 on - * success, -1 on failure. This routine should be invoked by the platform's - * memory error detection code. - * - * pages_retired_limit_exceeded() - * We set a limit on the number of pages which may be retired. This - * is set to a percentage of total physical memory. This limit is - * enforced here. - */ - -static pgcnt_t retired_pgcnt = 0; - -/* - * routines to update the count of retired pages - */ -static void -page_retired(page_t *pp) -{ - ASSERT(pp); - - page_settoxic(pp, PAGE_IS_RETIRED); - atomic_add_long(&retired_pgcnt, 1); -} - -static void -retired_page_removed(page_t *pp) -{ - ASSERT(pp); - ASSERT(page_isretired(pp)); - ASSERT(retired_pgcnt > 0); - - page_clrtoxic(pp); - atomic_add_long(&retired_pgcnt, -1); -} - - -static int -pages_retired_limit_exceeded() -{ - pgcnt_t retired_max; - - /* - * If the percentage is zero or is not set correctly, - * return TRUE so that pages are not retired. - */ - if (max_pages_retired_bps <= 0 || - max_pages_retired_bps >= 10000) - return (1); - - /* - * Calculate the maximum number of pages allowed to - * be retired as a percentage of total physical memory - * (Remember that we are using basis points, hence the 10000.) 
- */ - retired_max = (physmem * max_pages_retired_bps) / 10000; - - /* - * return 'TRUE' if we have already retired more - * than the legal limit - */ - return (retired_pgcnt >= retired_max); -} - -#define PAGE_RETIRE_SELOCK 0 -#define PAGE_RETIRE_NORECLAIM 1 -#define PAGE_RETIRE_LOCKED 2 -#define PAGE_RETIRE_COW 3 -#define PAGE_RETIRE_DIRTY 4 -#define PAGE_RETIRE_LPAGE 5 -#define PAGE_RETIRE_SUCCESS 6 -#define PAGE_RETIRE_LIMIT 7 -#define PAGE_RETIRE_NCODES 8 - -typedef struct page_retire_op { - int pr_count; - short pr_unlock; - short pr_retval; - char *pr_message; -} page_retire_op_t; - -page_retire_op_t page_retire_ops[PAGE_RETIRE_NCODES] = { - { 0, 0, -1, "cannot lock page" }, - { 0, 0, -1, "cannot reclaim cached page" }, - { 0, 1, -1, "page is locked" }, - { 0, 1, -1, "copy-on-write page" }, - { 0, 1, -1, "page is dirty" }, - { 0, 1, -1, "cannot demote large page" }, - { 0, 0, 0, "page successfully retired" }, - { 0, 0, -1, "excess pages retired already" }, -}; - -static int -page_retire_done(page_t *pp, int code) -{ - page_retire_op_t *prop = &page_retire_ops[code]; - - prop->pr_count++; - - if (prop->pr_unlock) - page_unlock(pp); - - if (page_retire_messages > 1) { - printf("page_retire(%p) pfn 0x%lx %s: %s\n", - (void *)pp, page_pptonum(pp), - prop->pr_retval == -1 ? "failed" : "succeeded", - prop->pr_message); - } - - return (prop->pr_retval); -} - -int -page_retire(page_t *pp, uchar_t flag) -{ - uint64_t pa = ptob((uint64_t)page_pptonum(pp)); - - ASSERT(flag == PAGE_IS_FAILING || flag == PAGE_IS_TOXIC); - - /* - * DR operations change the association between a page_t - * and the physical page it represents. Check if the - * page is still bad. - */ - if (!page_isfaulty(pp)) { - page_clrtoxic(pp); - return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); - } - - /* - * We set the flag here so that even if we fail due - * to exceeding the limit for retired pages, the - * page will still be checked and either cleared - * or retired in page_free(). - */ - page_settoxic(pp, flag); - - if (flag == PAGE_IS_TOXIC) { - if (page_retire_messages) { - cmn_err(CE_NOTE, "Scheduling clearing of error on" - " page 0x%08x.%08x", - (uint32_t)(pa >> 32), (uint32_t)pa); - } - - } else { /* PAGE_IS_FAILING */ - if (pages_retired_limit_exceeded()) { - /* - * Return as we have already exceeded the - * maximum number of pages allowed to be - * retired - */ - return (page_retire_done(pp, PAGE_RETIRE_LIMIT)); - } - - if (page_retire_messages) { - cmn_err(CE_NOTE, "Scheduling removal of " - "page 0x%08x.%08x", - (uint32_t)(pa >> 32), (uint32_t)pa); - } - } - - if (PAGE_LOCKED(pp) || !page_trylock(pp, SE_EXCL)) - return (page_retire_done(pp, PAGE_RETIRE_SELOCK)); - - /* - * If this is a large page we first try and demote it - * to PAGESIZE pages and then dispose of the toxic page. - * On failure we will let the page free/destroy - * code handle it later since this is a mapped page. - * Note that free large pages can always be demoted. - * - */ - if (pp->p_szc != 0) { - if (PP_ISFREE(pp)) - (void) page_demote_free_pages(pp); - else - (void) page_try_demote_pages(pp); - - if (pp->p_szc != 0) - return (page_retire_done(pp, PAGE_RETIRE_LPAGE)); - } - - if (PP_ISFREE(pp)) { - if (!page_reclaim(pp, NULL)) - return (page_retire_done(pp, PAGE_RETIRE_NORECLAIM)); - /*LINTED: constant in conditional context*/ - VN_DISPOSE(pp, pp->p_vnode ? 
B_INVAL : B_FREE, 0, kcred) - return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); - } - - if (pp->p_lckcnt != 0) - return (page_retire_done(pp, PAGE_RETIRE_LOCKED)); - - if (pp->p_cowcnt != 0) - return (page_retire_done(pp, PAGE_RETIRE_COW)); - - /* - * Unload all translations to this page. No new translations - * can be created while we hold the exclusive lock on the page. - */ - (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); - - if (hat_ismod(pp)) - return (page_retire_done(pp, PAGE_RETIRE_DIRTY)); - - /*LINTED: constant in conditional context*/ - VN_DISPOSE(pp, B_INVAL, 0, kcred); - - return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); -} - -/* * Mark any existing pages for migration in the given range */ void @@ -6128,140 +5718,6 @@ next: } } -/* - * initialize the vnode for retired pages - */ -static void -page_retired_init(void) -{ - vn_setops(&retired_ppages, &retired_vnodeops); -} - -/* ARGSUSED */ -static void -retired_dispose(vnode_t *vp, page_t *pp, int flag, int dn, cred_t *cr) -{ - panic("retired_dispose invoked"); -} - -/* ARGSUSED */ -static void -retired_inactive(vnode_t *vp, cred_t *cr) -{} - -void -page_unretire_pages(void) -{ - page_t *pp; - kmutex_t *vphm; - vnode_t *vp; - page_t *rpages[UNRETIRE_PAGES]; - pgcnt_t i, npages, rmem; - uint64_t pa; - - rmem = 0; - - for (;;) { - /* - * We do this in 2 steps: - * - * 1. We walk the retired pages list and collect a list of - * pages that have the toxic field cleared. - * - * 2. We iterate through the page list and unretire each one. - * - * We have to do it in two steps on account of the mutexes that - * we need to acquire. - */ - - vp = &retired_ppages; - vphm = page_vnode_mutex(vp); - mutex_enter(vphm); - - if ((pp = vp->v_pages) == NULL) { - mutex_exit(vphm); - break; - } - - i = 0; - do { - ASSERT(pp != NULL); - ASSERT(pp->p_vnode == vp); - - /* - * DR operations change the association between a page_t - * and the physical page it represents. Check if the - * page is still bad. If not, unretire it. - */ - if (!page_isfaulty(pp)) - rpages[i++] = pp; - - pp = pp->p_vpnext; - } while ((pp != vp->v_pages) && (i < UNRETIRE_PAGES)); - - mutex_exit(vphm); - - npages = i; - for (i = 0; i < npages; i++) { - pp = rpages[i]; - pa = ptob((uint64_t)page_pptonum(pp)); - - /* - * Need to upgrade the shared lock to an exclusive - * lock in order to hash out the page. - * - * The page could have been retired but the page lock - * may not have been downgraded yet. If so, skip this - * page. page_free() will call this function after the - * lock is downgraded. - */ - - if (!PAGE_SHARED(pp) || !page_tryupgrade(pp)) - continue; - - /* - * Both page_free() and DR call this function. They - * can potentially call this function at the same - * time and race with each other. - */ - if (!page_isretired(pp) || page_isfaulty(pp)) { - page_downgrade(pp); - continue; - } - - cmn_err(CE_NOTE, - "unretiring retired page 0x%08x.%08x", - (uint32_t)(pa >> 32), (uint32_t)pa); - - /* - * When a page is removed from the retired pages vnode, - * its toxic field is also cleared. So, we do not have - * to do that seperately here. - */ - page_hashout(pp, (kmutex_t *)NULL); - - /* - * This is a good page. So, free it. - */ - pp->p_vnode = NULL; - page_free(pp, 1); - rmem++; - } - - /* - * If the rpages array was filled up, then there could be more - * retired pages that are not faulty. We need to iterate - * again and unretire them. Otherwise, we are done. 
-		 */
-		if (npages < UNRETIRE_PAGES)
-			break;
-	}
-
-	mutex_enter(&freemem_lock);
-	availrmem += rmem;
-	mutex_exit(&freemem_lock);
-}
-
 ulong_t mem_waiters = 0;
 ulong_t max_count = 20;
 #define MAX_DELAY 0x1ff
@@ -6621,90 +6077,48 @@ page_clr_all_props(page_t *pp)
 }
 
 /*
- * The following functions is called from free_vp_pages()
- * for an inexact estimate of a newly free'd page...
+ * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
  */
-ulong_t
-page_share_cnt(page_t *pp)
-{
-	return (hat_page_getshare(pp));
-}
-
-/*
- * The following functions are used in handling memory
- * errors.
- */
-
-int
-page_istoxic(page_t *pp)
-{
-	return ((pp->p_toxic & PAGE_IS_TOXIC) == PAGE_IS_TOXIC);
-}
-
-int
-page_isfailing(page_t *pp)
-{
-	return ((pp->p_toxic & PAGE_IS_FAILING) == PAGE_IS_FAILING);
-}
-
-int
-page_isretired(page_t *pp)
-{
-	return ((pp->p_toxic & PAGE_IS_RETIRED) == PAGE_IS_RETIRED);
-}
-
 int
-page_deteriorating(page_t *pp)
+page_clear_lck_cow(page_t *pp, int adjust)
 {
-	return ((pp->p_toxic & (PAGE_IS_TOXIC | PAGE_IS_FAILING)) != 0);
-}
+	int	f_amount;
 
-void
-page_settoxic(page_t *pp, uchar_t flag)
-{
-	uchar_t new_flag = 0;
-	while ((new_flag & flag) != flag) {
-		uchar_t old_flag = pp->p_toxic;
-		new_flag = old_flag | flag;
-		(void) cas8(&pp->p_toxic, old_flag, new_flag);
-		new_flag = ((volatile page_t *)pp)->p_toxic;
-	}
-}
+	ASSERT(PAGE_EXCL(pp));
 
-void
-page_clrtoxic(page_t *pp)
-{
 	/*
-	 * We don't need to worry about atomicity on the
-	 * p_toxic flag here as this is only called from
-	 * page_free() while holding an exclusive lock on
-	 * the page
+	 * The page_struct_lock need not be acquired here since
+	 * we require the caller hold the page exclusively locked.
 	 */
-	pp->p_toxic = PAGE_IS_OK;
-}
+	f_amount = 0;
+	if (pp->p_lckcnt) {
+		f_amount = 1;
+		pp->p_lckcnt = 0;
+	}
+	if (pp->p_cowcnt) {
+		f_amount += pp->p_cowcnt;
+		pp->p_cowcnt = 0;
+	}
 
-void
-page_clrtoxic_flag(page_t *pp, uchar_t flag)
-{
-	uchar_t new_flag = ((volatile page_t *)pp)->p_toxic;
-	while ((new_flag & flag) == flag) {
-		uchar_t old_flag = new_flag;
-		new_flag = old_flag & ~flag;
-		(void) cas8(&pp->p_toxic, old_flag, new_flag);
-		new_flag = ((volatile page_t *)pp)->p_toxic;
+	if (adjust && f_amount) {
+		mutex_enter(&freemem_lock);
+		availrmem += f_amount;
+		mutex_exit(&freemem_lock);
 	}
-}
 
-int
-page_isfaulty(page_t *pp)
-{
-	return ((pp->p_toxic & PAGE_IS_FAULTY) == PAGE_IS_FAULTY);
+	return (f_amount);
 }
 
 /*
- * The following four functions are called from /proc code
- * for the /proc/<pid>/xmap interface.
+ * The following function is called from free_vp_pages()
+ * for an inexact estimate of a newly free'd page...
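+ * (It simply wraps hat_page_getshare(); the value may change as soon as
+ * it is read, hence "inexact".)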
*/ +ulong_t +page_share_cnt(page_t *pp) +{ + return (hat_page_getshare(pp)); +} + int page_isshared(page_t *pp) { diff --git a/usr/src/uts/common/vm/vm_pagelist.c b/usr/src/uts/common/vm/vm_pagelist.c index 225e8d157f..994ddca8a6 100644 --- a/usr/src/uts/common/vm/vm_pagelist.c +++ b/usr/src/uts/common/vm/vm_pagelist.c @@ -1305,7 +1305,7 @@ page_list_add_pages(page_t *pp, int flags) kcage_freemem_add(pgcnt); #endif for (i = 0; i < pgcnt; i++, pp++) - page_unlock(pp); + page_unlock_noretire(pp); } } @@ -1753,7 +1753,7 @@ page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags) index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); phm = PAGE_HASH_MUTEX(index); if (!mutex_tryenter(phm)) { - page_unlock(pp); + page_unlock_noretire(pp); goto fail_promote; } @@ -1761,7 +1761,7 @@ page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags) page_hashout(pp, phm); mutex_exit(phm); PP_SETAGED(pp); - page_unlock(pp); + page_unlock_noretire(pp); which_list = PG_CACHE_LIST; } page_ctr_sub(mnode, mtype, pp, which_list); @@ -2209,7 +2209,7 @@ page_trylock_cons(page_t *pp, se_t se) * have locked so far. */ while (first_pp != tpp) { - page_unlock(first_pp); + page_unlock_noretire(first_pp); first_pp = first_pp->p_next; } return (0); @@ -2575,7 +2575,7 @@ skipptcpcheck: while (--i != (pgcnt_t)-1) { pp = &spp[i]; ASSERT(PAGE_EXCL(pp)); - page_unlock(pp); + page_unlock_noretire(pp); } return (0); } @@ -2584,7 +2584,7 @@ skipptcpcheck: !PP_ISFREE(pp)) { VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); ASSERT(i == 0); - page_unlock(pp); + page_unlock_noretire(pp); return (0); } if (PP_ISNORELOC(pp)) { @@ -2592,7 +2592,7 @@ skipptcpcheck: while (i != (pgcnt_t)-1) { pp = &spp[i]; ASSERT(PAGE_EXCL(pp)); - page_unlock(pp); + page_unlock_noretire(pp); i--; } return (0); @@ -2687,7 +2687,7 @@ page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) */ while (pgcnt--) { ASSERT(PAGE_EXCL(pp)); - page_unlock(pp); + page_unlock_noretire(pp); pp++; } /* @@ -2702,7 +2702,7 @@ page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) ASSERT(PP_ISAGED(pp)); pp->p_szc = 0; page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); - page_unlock(pp); + page_unlock_noretire(pp); } if (replpp != NULL) @@ -2734,7 +2734,7 @@ page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) page_sub(&replpp, rpp); ASSERT(PAGE_EXCL(rpp)); ASSERT(!PP_ISFREE(rpp)); - page_unlock(rpp); + page_unlock_noretire(rpp); } ASSERT(targpp == hpp); ASSERT(replpp == NULL); diff --git a/usr/src/uts/i86pc/os/machdep.c b/usr/src/uts/i86pc/os/machdep.c index 1d6cd158b4..5071dae5ee 100644 --- a/usr/src/uts/i86pc/os/machdep.c +++ b/usr/src/uts/i86pc/os/machdep.c @@ -226,6 +226,8 @@ mdboot(int cmd, int fcn, char *mdep, boolean_t invoke_cb) if (invoke_cb) (void) callb_execute_class(CB_CL_MDBOOT, NULL); + page_retire_hunt(page_retire_mdboot_cb); + /* * stop other cpus and raise our priority. since there is only * one active cpu after this, and our priority will be too high diff --git a/usr/src/uts/sun4u/cpu/spitfire.c b/usr/src/uts/sun4u/cpu/spitfire.c index 35d97e6b23..75f57ce39c 100644 --- a/usr/src/uts/sun4u/cpu/spitfire.c +++ b/usr/src/uts/sun4u/cpu/spitfire.c @@ -432,6 +432,7 @@ void cpu_setup(void) { extern int page_retire_messages; + extern int page_retire_first_ue; extern int at_flags; #if defined(SF_ERRATA_57) extern caddr_t errata57_limit; @@ -445,9 +446,11 @@ cpu_setup(void) /* * Spitfire isn't currently FMA-aware, so we have to enable the - * page retirement messages. + * page retirement messages. 
We also change the default policy + * for UE retirement to allow clearing of transient errors. */ page_retire_messages = 1; + page_retire_first_ue = 0; /* * save the cache bootup state. @@ -895,10 +898,7 @@ cpu_ce_error(struct regs *rp, ulong_t p_afar, ulong_t p_afsr, curthread->t_ontrap != NULL) { if (curthread->t_ontrap->ot_prot & OT_DATA_EC) { - page_t *pp = page_numtopp_nolock((pfn_t) - (ecc->flt_addr >> MMU_PAGESHIFT)); - - if (pp != NULL && page_isretired(pp)) { + if (page_retire_check(ecc->flt_addr, NULL) == 0) { queue = 0; } } @@ -1093,6 +1093,7 @@ cpu_ce_log_err(struct async_flt *ecc, errorq_elem_t *eqep) char unum[UNUM_NAMLEN]; int len = 0; int ce_verbose = 0; + int err; ASSERT(ecc->flt_func != NULL); @@ -1107,15 +1108,9 @@ cpu_ce_log_err(struct async_flt *ecc, errorq_elem_t *eqep) * Count errors per unum. * Non-memory errors are all counted via a special unum string. */ - if (ce_count_unum(ecc->flt_status, len, unum) == PAGE_IS_FAILING && + if ((err = ce_count_unum(ecc->flt_status, len, unum)) != PR_OK && automatic_page_removal) { - page_t *pp = page_numtopp_nolock((pfn_t) - (ecc->flt_addr >> MMU_PAGESHIFT)); - - if (pp) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_FAILING); - } + (void) page_retire(ecc->flt_addr, err); } if (ecc->flt_panic) { @@ -2092,11 +2087,7 @@ cpu_async_log_err(void *flt) if (!panicstr && (aflt->flt_stat & S_AFSR_ALL_ERRS) == P_AFSR_UE && aflt->flt_prot == AFLT_PROT_EC) { - page_t *pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - - if (pp != NULL && page_isretired(pp)) { - + if (page_retire_check(aflt->flt_addr, NULL) == 0) { /* Zero the address to clear the error */ softcall(ecc_page_zero, (void *)aflt->flt_addr); return; @@ -2305,25 +2296,7 @@ cpu_async_log_err(void *flt) if (aflt->flt_addr != AFLT_INV_ADDR && aflt->flt_in_memory) { if (!panicstr) { - /* - * Retire the bad page that caused the error - */ - page_t *pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - - if (pp != NULL) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_TOXIC); - } else { - uint64_t pa = - P2ALIGN(aflt->flt_addr, MMU_PAGESIZE); - - cpu_aflt_log(CE_CONT, 3, spf_flt, - CPU_ERRID_FIRST, NULL, - ": cannot schedule clearing of error on " - "page 0x%08x.%08x; page not in VM system", - (uint32_t)(pa >> 32), (uint32_t)pa); - } + (void) page_retire(aflt->flt_addr, PR_UE); } else { /* * Clear UEs on panic so that we don't @@ -4089,12 +4062,7 @@ static void ecache_page_retire(void *arg) { uint64_t paddr = (uint64_t)arg; - page_t *pp = page_numtopp_nolock((pfn_t)(paddr >> MMU_PAGESHIFT)); - - if (pp) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_TOXIC); - } + (void) page_retire(paddr, PR_UE); } void @@ -4331,15 +4299,14 @@ add_leaky_bucket_timeout(void) * false intermittents, so these intermittents can be safely ignored. * * If the error count is excessive for a DIMM, this function will return - * PAGE_IS_FAILING, and the CPU module may then decide to remove that page - * from use. + * PR_MCE, and the CPU module may then decide to remove that page from use. 
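+ * (PR_MCE is the page_retire() reason code for pages that have seen
+ * multiple correctable errors.)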
*/ static int ce_count_unum(int status, int len, char *unum) { int i; struct ce_info *psimm = mem_ce_simm; - int page_status = PAGE_IS_OK; + int page_status = PR_OK; ASSERT(psimm != NULL); @@ -4375,7 +4342,7 @@ ce_count_unum(int status, int len, char *unum) cmn_err(CE_WARN, "[AFT0] Sticky Softerror encountered " "on Memory Module %s\n", unum); - page_status = PAGE_IS_FAILING; + page_status = PR_MCE; } else if (status & ECC_PERSISTENT) { psimm[i].leaky_bucket_cnt = 1; psimm[i].intermittent_total = 0; @@ -4404,7 +4371,7 @@ ce_count_unum(int status, int len, char *unum) cmn_err(CE_WARN, "[AFT0] Sticky Softerror encountered " "on Memory Module %s\n", unum); - page_status = PAGE_IS_FAILING; + page_status = PR_MCE; } else if (status & ECC_PERSISTENT) { int new_value; @@ -4422,7 +4389,7 @@ ce_count_unum(int status, int len, char *unum) ecc_softerr_interval % 60); atomic_add_16( &psimm[i].leaky_bucket_cnt, -1); - page_status = PAGE_IS_FAILING; + page_status = PR_MCE; } } else { /* Intermittent */ psimm[i].intermittent_total++; @@ -4444,15 +4411,11 @@ ce_count_unum(int status, int len, char *unum) void cpu_ce_count_unum(struct async_flt *ecc, int len, char *unum) { - if (ce_count_unum(ecc->flt_status, len, unum) == PAGE_IS_FAILING && - automatic_page_removal) { - page_t *pp = page_numtopp_nolock((pfn_t) - (ecc->flt_addr >> MMU_PAGESHIFT)); + int err; - if (pp) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_FAILING); - } + err = ce_count_unum(ecc->flt_status, len, unum); + if (err != PR_OK && automatic_page_removal) { + (void) page_retire(ecc->flt_addr, err); } } diff --git a/usr/src/uts/sun4u/cpu/us3_common.c b/usr/src/uts/sun4u/cpu/us3_common.c index f8d8b2bb77..f7cc35c664 100644 --- a/usr/src/uts/sun4u/cpu/us3_common.c +++ b/usr/src/uts/sun4u/cpu/us3_common.c @@ -2205,7 +2205,7 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) { ch_async_flt_t *ch_flt = (ch_async_flt_t *)flt; struct async_flt *aflt = (struct async_flt *)flt; - page_t *pp; + uint64_t errors; switch (ch_flt->flt_type) { case CPU_INV_AFSR: @@ -2236,9 +2236,6 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) */ case CPU_CE: case CPU_EMC: - pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - /* * We want to skip logging and further classification * only if ALL the following conditions are true: @@ -2258,7 +2255,7 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_CE && aflt->flt_prot == AFLT_PROT_EC) { - if (pp != NULL && page_isretired(pp)) { + if (page_retire_check(aflt->flt_addr, NULL) == 0) { if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) { /* @@ -2289,17 +2286,17 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) * * Note: Check cpu_impl_async_log_err if changing this */ - if (pp) { - if (page_isretired(pp) || page_deteriorating(pp)) { + if (page_retire_check(aflt->flt_addr, &errors) == EINVAL) { + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_NOPP); + } else { + if (errors != PR_OK) { CE_XDIAG_SETSKIPCODE(aflt->flt_disp, CE_XDIAG_SKIP_PAGEDET); } else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep, offsetof(ch_async_flt_t, cmn_asyncflt))) { return (0); } - } else { - CE_XDIAG_SETSKIPCODE(aflt->flt_disp, - CE_XDIAG_SKIP_NOPP); } /*FALLTHRU*/ @@ -2325,11 +2322,7 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) if (!panicstr && (ch_flt->afsr_errs & (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_UE && aflt->flt_prot == AFLT_PROT_EC) { - page_t *pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - 
- if (pp != NULL && page_isretired(pp)) { - + if (page_retire_check(aflt->flt_addr, NULL) == 0) { /* Zero the address to clear the error */ softcall(ecc_page_zero, (void *)aflt->flt_addr); return (0); @@ -2387,12 +2380,7 @@ void cpu_page_retire(ch_async_flt_t *ch_flt) { struct async_flt *aflt = (struct async_flt *)ch_flt; - page_t *pp = page_numtopp_nolock(aflt->flt_addr >> MMU_PAGESHIFT); - - if (pp != NULL) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_TOXIC); - } + (void) page_retire(aflt->flt_addr, PR_UE); } /* diff --git a/usr/src/uts/sun4u/cpu/us3_jalapeno.c b/usr/src/uts/sun4u/cpu/us3_jalapeno.c index cd71848200..0b7936d426 100644 --- a/usr/src/uts/sun4u/cpu/us3_jalapeno.c +++ b/usr/src/uts/sun4u/cpu/us3_jalapeno.c @@ -316,7 +316,7 @@ cpu_impl_async_log_err(void *flt, errorq_elem_t *eqep) { ch_async_flt_t *ch_flt = (ch_async_flt_t *)flt; struct async_flt *aflt = (struct async_flt *)flt; - page_t *pp; + uint64_t errors; switch (ch_flt->flt_type) { @@ -329,19 +329,15 @@ cpu_impl_async_log_err(void *flt, errorq_elem_t *eqep) return (CH_ASYNC_LOG_DONE); case CPU_RCE: - pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - if (pp) { - if (page_isretired(pp) || page_deteriorating(pp)) { - CE_XDIAG_SETSKIPCODE(aflt->flt_disp, - CE_XDIAG_SKIP_PAGEDET); - } else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep, - offsetof(ch_async_flt_t, cmn_asyncflt))) { - return (CH_ASYNC_LOG_RECIRC); - } - } else { + if (page_retire_check(aflt->flt_addr, &errors) == EINVAL) { CE_XDIAG_SETSKIPCODE(aflt->flt_disp, CE_XDIAG_SKIP_NOPP); + } else if (errors != PR_OK) { + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_PAGEDET); + } else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep, + offsetof(ch_async_flt_t, cmn_asyncflt))) { + return (CH_ASYNC_LOG_RECIRC); } /*FALLTHRU*/ /* diff --git a/usr/src/uts/sun4u/io/pci/pci_ecc.c b/usr/src/uts/sun4u/io/pci/pci_ecc.c index 0f92d73663..8820683ba6 100644 --- a/usr/src/uts/sun4u/io/pci/pci_ecc.c +++ b/usr/src/uts/sun4u/io/pci/pci_ecc.c @@ -534,21 +534,21 @@ ecc_err_handler(ecc_errstate_t *ecc_err_p) * Called from ecc_err_drain below for CBINTR_CE case. */ static int -ecc_err_cexdiag(page_t *pp, ecc_errstate_t *ecc_err, - errorq_elem_t *eqep) +ecc_err_cexdiag(ecc_errstate_t *ecc_err, errorq_elem_t *eqep) { struct async_flt *ecc = &ecc_err->ecc_aflt; + uint64_t errors; - if (!pp) { + if (page_retire_check(ecc->flt_addr, &errors) == EINVAL) { CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_NOPP); return (0); - } else if (page_isretired(pp) || page_deteriorating(pp)) { + } else if (errors != PR_OK) { CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_PAGEDET); return (0); + } else { + return (ce_scrub_xdiag_recirc(ecc, pci_ecc_queue, eqep, + offsetof(ecc_errstate_t, ecc_aflt))); } - - return (ce_scrub_xdiag_recirc(ecc, pci_ecc_queue, eqep, - offsetof(ecc_errstate_t, ecc_aflt))); } /* @@ -561,7 +561,6 @@ ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep) { struct async_flt *ecc = &ecc_err->ecc_aflt; pci_t *pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[0]; - page_t *pp; int ecc_type = ecc_err->ecc_ii_p.ecc_type; if (pci_p == NULL) @@ -581,13 +580,10 @@ ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep) ecc_cpu_call(ecc, ecc_err->ecc_unum, (ecc_type == CBNINTR_UE) ? 
ECC_IO_UE : ECC_IO_CE); - pp = page_numtopp_nolock(ecc->flt_addr >> MMU_PAGESHIFT); - switch (ecc_type) { case CBNINTR_UE: - if (pp && ecc_err->ecc_pg_ret == 1) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_TOXIC); + if (ecc_err->ecc_pg_ret == 1) { + (void) page_retire(ecc->flt_addr, PR_UE); } ecc_err->ecc_err_type = flt_to_error_type(ecc); break; @@ -609,7 +605,7 @@ ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep) /* ecc_err_cexdiag returns nonzero to recirculate */ if (CE_XDIAG_EXT_ALG_APPLIED(ecc->flt_disp) && - ecc_err_cexdiag(pp, ecc_err, eqep)) + ecc_err_cexdiag(ecc_err, eqep)) return; ecc_err->ecc_err_type = flt_to_error_type(ecc); break; diff --git a/usr/src/uts/sun4u/ngdr/io/dr_mem.c b/usr/src/uts/sun4u/ngdr/io/dr_mem.c index e876db93b5..1dd67f5824 100644 --- a/usr/src/uts/sun4u/ngdr/io/dr_mem.c +++ b/usr/src/uts/sun4u/ngdr/io/dr_mem.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -80,8 +80,6 @@ static struct memlist *memlist_del_span(struct memlist *mlist, static struct memlist *memlist_cat_span(struct memlist *mlist, uint64_t base, uint64_t len); -extern void page_unretire_pages(void); - /* * dr_mem_unit_t.sbm_flags */ @@ -427,57 +425,13 @@ dr_mem_ecache_scrub(dr_mem_unit_t *mp, struct memlist *mlist) #endif /* DEBUG */ } -/* - * This function marks as clean, all the faulty pages that belong to the - * board that is copy-renamed since they are not likely to be bad pages - * after the rename. This includes the retired pages on the board. - */ - -static void -dr_memlist_clrpages(struct memlist *r_ml) -{ - struct memlist *t_ml; - page_t *pp, *epp; - pfn_t pfn, epfn; - struct memseg *seg; - - if (r_ml == NULL) - return; - - for (t_ml = r_ml; (t_ml != NULL); t_ml = t_ml->next) { - pfn = _b64top(t_ml->address); - epfn = _b64top(t_ml->address + t_ml->size); - - for (seg = memsegs; seg != NULL; seg = seg->next) { - if (pfn >= seg->pages_end || epfn < seg->pages_base) - continue; - - pp = seg->pages; - if (pfn > seg->pages_base) - pp += pfn - seg->pages_base; - - epp = seg->epages; - if (epfn < seg->pages_end) - epp -= seg->pages_end - epfn; - - ASSERT(pp < epp); - while (pp < epp) { - if (page_isfaulty((page_t *)pp)) - page_clrtoxic_flag((page_t *)pp, - PAGE_IS_FAULTY); - pp++; - } - } - } -} - static int dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp) { time_t copytime; drmachid_t cr_id; dr_sr_handle_t *srhp; - struct memlist *c_ml, *d_ml, *r_ml; + struct memlist *c_ml, *d_ml; sbd_error_t *err; static fn_t f = "dr_move_memory"; @@ -507,11 +461,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp) d_ml = d_ml->next; } - /* - * create a copy of the memlist to be used for retiring pages. 
- */ - r_ml = memlist_dup(c_ml); - affinity_set(drmach_mem_cpu_affinity(t_mp->sbm_cm.sbdev_id)); err = drmach_copy_rename_init( @@ -520,7 +469,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp) if (err) { DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err); affinity_clear(); - memlist_delete(r_ml); return (-1); } @@ -553,7 +501,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp) hp->h_err = NULL; affinity_clear(); - memlist_delete(r_ml); return (-1); } @@ -573,12 +520,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp) drmach_copy_rename(cr_id); - /* - * Clear pages that have been marked as faulty since we are - * changing the physical memory for the pages. - */ - dr_memlist_clrpages(r_ml); - /* Resume the OS. */ dr_resume(srhp); @@ -594,11 +535,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp) PR_MEM("%s: copy-rename elapsed time = %ld ticks (%ld secs)\n", f, copytime, copytime / hz); - memlist_delete(r_ml); - - /* Unretire any pages cleared after copy-rename */ - page_unretire_pages(); - /* return -1 if dr_suspend or copy/rename recorded an error */ return (err == NULL ? 0 : -1); } diff --git a/usr/src/uts/sun4u/os/ecc.c b/usr/src/uts/sun4u/os/ecc.c index 10b6cb523f..bd933377e4 100644 --- a/usr/src/uts/sun4u/os/ecc.c +++ b/usr/src/uts/sun4u/os/ecc.c @@ -247,23 +247,21 @@ error_init(void) } /* - * Success flags for ecc_page_zero + * Flags for ecc_page_zero DTrace probe since ecc_page_zero() is called + * as a softint handler. */ #define PAGE_ZERO_SUCCESS 0 #define PAGE_ZERO_FAIL_NOLOCK 1 #define PAGE_ZERO_FAIL_ONTRAP 2 -/* - * arg is a physical address - zero out the page that contains it - */ void ecc_page_zero(void *arg) { uint64_t pa = (uint64_t)arg; - page_t *pp = page_numtopp_nolock((pfn_t)(pa >> MMU_PAGESHIFT)); int ret, success_flag; + page_t *pp = page_numtopp_nolock(mmu_btop(pa)); - if (pp == NULL || !page_isretired(pp)) + if (page_retire_check(pa, NULL) != 0) return; /* diff --git a/usr/src/uts/sun4u/os/mach_cpu_states.c b/usr/src/uts/sun4u/os/mach_cpu_states.c index 0815f54170..4144c91c79 100644 --- a/usr/src/uts/sun4u/os/mach_cpu_states.c +++ b/usr/src/uts/sun4u/os/mach_cpu_states.c @@ -66,7 +66,6 @@ extern int disable_watchdog_on_exit; void mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb) { - page_t *first, *pp; extern void pm_cfb_check_and_powerup(void); /* @@ -79,25 +78,6 @@ mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb) } /* - * Clear any unresolved UEs from memory. We rely on the fact that on - * sun4u, pagezero() will always clear UEs. Since we're rebooting, we - * just force p_selock to appear locked so pagezero()'s assert works. - * - * Pages that were retired successfully due to multiple CEs will - * also be cleared. - */ - if (memsegs != NULL) { - pp = first = page_first(); - do { - if (page_isretired(pp) || page_istoxic(pp)) { - /* pagezero asserts PAGE_LOCKED */ - pp->p_selock = -1; - pagezero(pp, 0, PAGESIZE); - } - } while ((pp = page_next(pp)) != first); - } - - /* * XXX - rconsvp is set to NULL to ensure that output messages * are sent to the underlying "hardware" device using the * monitor's printf routine since we are in the process of @@ -123,6 +103,12 @@ mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb) (void) callb_execute_class(CB_CL_MDBOOT, NULL); /* + * Clear any unresolved UEs from memory. + */ + if (memsegs != NULL) + page_retire_hunt(page_retire_mdboot_cb); + + /* * stop other cpus which also raise our priority. 
since there is only * one active cpu after this, and our priority will be too high * for us to be preempted, we're essentially single threaded diff --git a/usr/src/uts/sun4v/os/error.c b/usr/src/uts/sun4v/os/error.c index 9d13b1781b..bd2b7fde49 100644 --- a/usr/src/uts/sun4v/os/error.c +++ b/usr/src/uts/sun4v/os/error.c @@ -87,8 +87,7 @@ static uint32_t rq_overflow_count = 0; /* counter for rq overflow */ static void cpu_queue_one_event(errh_async_flt_t *); static uint32_t count_entries_on_queue(uint64_t, uint64_t, uint32_t); -static void errh_page_settoxic(errh_async_flt_t *, uchar_t); -static void errh_page_retire(errh_async_flt_t *); +static void errh_page_retire(errh_async_flt_t *, uchar_t); static int errh_error_protected(struct regs *, struct async_flt *, int *); static void errh_rq_full(struct async_flt *); static void ue_drain(void *, struct async_flt *, errorq_elem_t *); @@ -300,12 +299,10 @@ process_nonresumable_error(struct regs *rp, uint64_t tl, } /* - * If it is a memory error, we turn on the PAGE_IS_TOXIC - * flag. The page will be retired later and scrubbed when - * it is freed. + * Call page_retire() to handle memory errors. */ if (errh_flt.errh_er.attr & ERRH_ATTR_MEM) - (void) errh_page_settoxic(&errh_flt, PAGE_IS_TOXIC); + errh_page_retire(&errh_flt, PR_UE); /* * If we queued an error and the it was in user mode or @@ -443,10 +440,10 @@ cpu_async_log_err(void *flt) case ERRH_DESC_UCOR_RE: if (errh_erp->attr & ERRH_ATTR_MEM) { /* - * Turn on the PAGE_IS_TOXIC flag. The page will be + * Turn on the PR_UE flag. The page will be * scrubbed when it is freed. */ - (void) errh_page_settoxic(errh_fltp, PAGE_IS_TOXIC); + errh_page_retire(errh_fltp, PR_UE); } break; @@ -458,7 +455,7 @@ cpu_async_log_err(void *flt) * For non-resumable memory error, retire * the page here. */ - errh_page_retire(errh_fltp); + errh_page_retire(errh_fltp, PR_UE); /* * If we are going to panic, scrub the page first @@ -518,9 +515,8 @@ cpu_ue_log_err(struct async_flt *aflt) * Turn on flag on the error memory region. */ static void -errh_page_settoxic(errh_async_flt_t *errh_fltp, uchar_t flag) +errh_page_retire(errh_async_flt_t *errh_fltp, uchar_t flag) { - page_t *pp; uint64_t flt_real_addr_start = errh_fltp->errh_er.ra; uint64_t flt_real_addr_end = flt_real_addr_start + errh_fltp->errh_er.sz - 1; @@ -531,38 +527,7 @@ errh_page_settoxic(errh_async_flt_t *errh_fltp, uchar_t flag) for (current_addr = flt_real_addr_start; current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) { - pp = page_numtopp_nolock((pfn_t) - (current_addr >> MMU_PAGESHIFT)); - - if (pp != NULL) { - page_settoxic(pp, flag); - } - } -} - -/* - * Retire the page(s) indicated in the error report. 
- */ -static void -errh_page_retire(errh_async_flt_t *errh_fltp) -{ - page_t *pp; - uint64_t flt_real_addr_start = errh_fltp->errh_er.ra; - uint64_t flt_real_addr_end = flt_real_addr_start + - errh_fltp->errh_er.sz - 1; - int64_t current_addr; - - if (errh_fltp->errh_er.sz == 0) - return; - - for (current_addr = flt_real_addr_start; - current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) { - pp = page_numtopp_nolock((pfn_t) - (current_addr >> MMU_PAGESHIFT)); - - if (pp != NULL) { - (void) page_retire(pp, PAGE_IS_TOXIC); - } + (void) page_retire(current_addr, flag); } } diff --git a/usr/src/uts/sun4v/os/mach_cpu_states.c b/usr/src/uts/sun4v/os/mach_cpu_states.c index 75e2421e69..46c1fdbeff 100644 --- a/usr/src/uts/sun4v/os/mach_cpu_states.c +++ b/usr/src/uts/sun4v/os/mach_cpu_states.c @@ -106,29 +106,9 @@ extern uint64_t get_cpuaddr(uint64_t, uint64_t); void mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb) { - page_t *first, *pp; extern void pm_cfb_check_and_powerup(void); /* - * Clear any unresolved UEs from memory. We rely on the fact that on - * sun4u, pagezero() will always clear UEs. Since we're rebooting, we - * just force p_selock to appear locked so pagezero()'s assert works. - * - * Pages that were retired successfully due to multiple CEs will - * also be cleared. - */ - if (memsegs != NULL) { - pp = first = page_first(); - do { - if (page_isretired(pp) || page_istoxic(pp)) { - /* pagezero asserts PAGE_LOCKED */ - pp->p_selock = -1; - pagezero(pp, 0, PAGESIZE); - } - } while ((pp = page_next(pp)) != first); - } - - /* * XXX - rconsvp is set to NULL to ensure that output messages * are sent to the underlying "hardware" device using the * monitor's printf routine since we are in the process of @@ -154,6 +134,12 @@ mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb) (void) callb_execute_class(CB_CL_MDBOOT, NULL); /* + * Clear any unresolved UEs from memory. + */ + if (memsegs != NULL) + page_retire_hunt(page_retire_mdboot_cb); + + /* * stop other cpus which also raise our priority. since there is only * one active cpu after this, and our priority will be too high * for us to be preempted, we're essentially single threaded |