-rw-r--r--  usr/src/uts/common/Makefile.files  1
-rw-r--r--  usr/src/uts/common/io/mem.c  63
-rw-r--r--  usr/src/uts/common/os/mem_config.c  67
-rw-r--r--  usr/src/uts/common/sys/mem.h  20
-rw-r--r--  usr/src/uts/common/vm/page.h  110
-rw-r--r--  usr/src/uts/common/vm/page_lock.c  198
-rw-r--r--  usr/src/uts/common/vm/page_retire.c  1473
-rw-r--r--  usr/src/uts/common/vm/vm_page.c  850
-rw-r--r--  usr/src/uts/common/vm/vm_pagelist.c  20
-rw-r--r--  usr/src/uts/i86pc/os/machdep.c  2
-rw-r--r--  usr/src/uts/sun4u/cpu/spitfire.c  77
-rw-r--r--  usr/src/uts/sun4u/cpu/us3_common.c  30
-rw-r--r--  usr/src/uts/sun4u/cpu/us3_jalapeno.c  20
-rw-r--r--  usr/src/uts/sun4u/io/pci/pci_ecc.c  24
-rw-r--r--  usr/src/uts/sun4u/ngdr/io/dr_mem.c  68
-rw-r--r--  usr/src/uts/sun4u/os/ecc.c  10
-rw-r--r--  usr/src/uts/sun4u/os/mach_cpu_states.c  26
-rw-r--r--  usr/src/uts/sun4v/os/error.c  51
-rw-r--r--  usr/src/uts/sun4v/os/mach_cpu_states.c  26
19 files changed, 1986 insertions, 1150 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 57b3f1968f..32c38cdac3 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -51,6 +51,7 @@ COMMON_CORE_OBJS += \
lgrp_topo.o \
mutex.o \
page_lock.o \
+ page_retire.o \
panic.o \
param.o \
putnext.o \
diff --git a/usr/src/uts/common/io/mem.c b/usr/src/uts/common/io/mem.c
index 3aa34f9427..1e42907a5d 100644
--- a/usr/src/uts/common/io/mem.c
+++ b/usr/src/uts/common/io/mem.c
@@ -460,47 +460,51 @@ mmioctl_vtop(intptr_t data)
}
/*
- * Given a PA, retire that page or check whether it has already been retired.
+ * Given a PA, execute the specified page retire command on it.
*/
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
+ extern int page_retire_test(void);
uint64_t pa;
- pfn_t pfn;
- page_t *pp;
- if (copyin((void *)data, &pa, sizeof (uint64_t)))
+ if (copyin((void *)data, &pa, sizeof (uint64_t))) {
return (EFAULT);
+ }
- pfn = pa >> MMU_PAGESHIFT;
+ switch (cmd) {
+ case MEM_PAGE_ISRETIRED:
+ return (page_retire_check(pa, NULL));
- if (!pf_is_memory(pfn) || (pp = page_numtopp_nolock(pfn)) == NULL)
- return (EINVAL);
+ case MEM_PAGE_UNRETIRE:
+ return (page_unretire(pa));
- /*
- * If we're checking, see if the page is retired; if not, confirm that
- * its status is at least set to be failing. If neither, return EIO.
- */
- if (cmd == MEM_PAGE_ISRETIRED) {
- if (page_isretired(pp))
- return (0);
+ case MEM_PAGE_RETIRE:
+ return (page_retire(pa, PR_FMA));
- if (!page_isfailing(pp))
- return (EIO);
+ case MEM_PAGE_RETIRE_MCE:
+ return (page_retire(pa, PR_MCE));
- return (EAGAIN);
- }
+ case MEM_PAGE_RETIRE_UE:
+ return (page_retire(pa, PR_UE));
- /*
- * Try to retire the page. If the retire fails, it will be scheduled to
- * occur when the page is freed. If this page is out of circulation
- * already, or is in the process of being retired, we fail.
- */
- if (page_isretired(pp) || page_isfailing(pp))
- return (EIO);
+ case MEM_PAGE_GETERRORS:
+ {
+ uint64_t page_errors;
+ int rc = page_retire_check(pa, &page_errors);
+ if (copyout(&page_errors, (void *)data,
+ sizeof (uint64_t))) {
+ return (EFAULT);
+ }
+ return (rc);
+ }
+
+ case MEM_PAGE_RETIRE_TEST:
+ return (page_retire_test());
+
+ }
- page_settoxic(pp, PAGE_IS_FAULTY);
- return (page_retire(pp, PAGE_IS_FAILING) ? EAGAIN : 0);
+ return (EINVAL);
}
#ifdef __sparc
@@ -606,6 +610,11 @@ mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
case MEM_PAGE_RETIRE:
case MEM_PAGE_ISRETIRED:
+ case MEM_PAGE_UNRETIRE:
+ case MEM_PAGE_RETIRE_MCE:
+ case MEM_PAGE_RETIRE_UE:
+ case MEM_PAGE_GETERRORS:
+ case MEM_PAGE_RETIRE_TEST:
if (getminor(dev) != M_MEM)
return (ENXIO);
return (mmioctl_page_retire(cmd, data));
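Aside: a minimal, hypothetical user-land sketch of driving the ioctls above. The MEM_PAGE_* commands and the copyin/copyout protocol come from this patch; the /dev/mem usage, headers, and error handling are assumptions, not code from the putback. Note that MEM_PAGE_GETERRORS reads the PA from the user buffer and writes the error bits back to the same buffer.

/*
 * Hypothetical consumer sketch (not part of this patch): retire the
 * page at physical address 'pa', then report its error bits.
 */
#include <sys/types.h>
#include <sys/mem.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>

int
retire_and_report(uint64_t pa)
{
	uint64_t errbits = pa;		/* PA copied in, error bits out */
	int fd;

	if ((fd = open("/dev/mem", O_RDONLY)) == -1)
		return (errno);

	/* returns 0 if retired now; EAGAIN means "will retire on free" */
	if (ioctl(fd, MEM_PAGE_RETIRE, &pa) != 0 && errno != EAGAIN) {
		(void) close(fd);
		return (errno);
	}

	if (ioctl(fd, MEM_PAGE_GETERRORS, &errbits) == 0) {
		(void) printf("PA 0x%llx: error bits 0x%llx\n",
		    (unsigned long long)pa, (unsigned long long)errbits);
	}
	(void) close(fd);
	return (0);
}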
diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c
index 0d29cc59d6..8f398ac602 100644
--- a/usr/src/uts/common/os/mem_config.c
+++ b/usr/src/uts/common/os/mem_config.c
@@ -1770,31 +1770,13 @@ delete_memory_thread(caddr_t amhp)
}
if (!page_try_reclaim_lock(pp, SE_EXCL,
- SE_EXCL_WANTED)) {
- if (page_isretired(pp)) {
- /*
- * Page has been retired.
- *
- * Its shared lock can and
- * must be upgraded to an
- * exclusive lock in order
- * to hashout the page when
- * the delete completes.
- */
- page_lock_clr_exclwanted(pp);
- if (!page_tryupgrade(pp)) {
- mutex_enter(
- &mhp->mh_mutex);
- continue;
- }
- } else {
- /*
- * Page in use elsewhere.
- */
- MDSTAT_INCR(mhp, lockfail);
- mutex_enter(&mhp->mh_mutex);
- continue;
- }
+ SE_EXCL_WANTED | SE_RETIRED)) {
+ /*
+ * Page in use elsewhere. Skip it.
+ */
+ MDSTAT_INCR(mhp, lockfail);
+ mutex_enter(&mhp->mh_mutex);
+ continue;
}
/*
* See if the cage expanded into the delete.
@@ -1802,15 +1784,12 @@ delete_memory_thread(caddr_t amhp)
* cage to expand.
*/
if (PP_ISNORELOC(pp)) {
- if (page_isretired(pp))
- page_downgrade(pp);
- else
- page_unlock(pp);
+ page_unlock(pp);
mutex_enter(&mhp->mh_mutex);
mhp->mh_cancel = KPHYSM_ENONRELOC;
break;
}
- if (page_isretired(pp)) {
+ if (PP_RETIRED(pp)) {
/*
* Page has been retired and is
* not part of the cage so we
@@ -1861,11 +1840,11 @@ delete_memory_thread(caddr_t amhp)
}
/*
* Keep stats on pages encountered that
- * are toxic or failing but not retired.
+ * are marked for retirement.
*/
- if (page_istoxic(pp)) {
+ if (PP_TOXIC(pp)) {
MDSTAT_INCR(mhp, toxic);
- } else if (page_isfailing(pp)) {
+ } else if (PP_PR_REQ(pp)) {
MDSTAT_INCR(mhp, failing);
}
/*
@@ -1876,7 +1855,7 @@ delete_memory_thread(caddr_t amhp)
* previously associated with the page.
*/
if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
- if (!page_istoxic(pp)) {
+ if (!PP_TOXIC(pp)) {
/*
* Must relocate locked in
* memory pages.
@@ -1949,7 +1928,7 @@ delete_memory_thread(caddr_t amhp)
#ifdef MEM_DEL_STATS
start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
- if (mod && !page_istoxic(pp)) {
+ if (mod && !PP_TOXIC(pp)) {
/*
* Lock all constituent pages
* of a large page to ensure
@@ -2020,7 +1999,7 @@ delete_memory_thread(caddr_t amhp)
* set, we cannot do anything here to deal
* with it.
*/
- if (page_istoxic(pp)) {
+ if (PP_TOXIC(pp)) {
page_unlock(pp);
#ifdef MEM_DEL_STATS
ntick_pgrp = (uint64_t)ddi_get_lbolt() -
@@ -2067,7 +2046,7 @@ delete_memory_thread(caddr_t amhp)
continue;
}
if (page_try_reclaim_lock(pp, SE_EXCL,
- SE_EXCL_WANTED)) {
+ SE_EXCL_WANTED | SE_RETIRED)) {
if (PP_ISFREE(pp)) {
goto free_page_collect;
}
@@ -2229,12 +2208,8 @@ delete_memory_thread(caddr_t amhp)
/*
* If the memory delete was cancelled, exclusive-wanted bits must
- * be cleared, and also any retired pages that
- * were accounted for above must have their exclusive lock
- * downgraded to a shared lock to return them to their previous
- * state.
- * Otherwise, if the memory delete has completed, retired pages
- * must be hashed out.
+ * be cleared. If there are retired pages being deleted, they need
+ * to be unretired.
*/
for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
mdsp = mdsp->mds_next) {
@@ -2264,16 +2239,16 @@ delete_memory_thread(caddr_t amhp)
pp = page_numtopp_nolock(pfn);
}
ASSERT(pp != NULL);
- ASSERT(page_isretired(pp));
+ ASSERT(PP_RETIRED(pp));
if (mhp->mh_cancel != 0) {
- page_downgrade(pp);
+ page_unlock(pp);
/*
* To satisfy ASSERT below in
* cancel code.
*/
mhp->mh_hold_todo++;
} else {
- page_hashout(pp, (kmutex_t *)NULL);
+ (void) page_unretire_pp(pp, 0);
}
}
}
diff --git a/usr/src/uts/common/sys/mem.h b/usr/src/uts/common/sys/mem.h
index e741d56b9f..f2b23b8029 100644
--- a/usr/src/uts/common/sys/mem.h
+++ b/usr/src/uts/common/sys/mem.h
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -60,11 +60,25 @@ typedef struct mem_vtop {
* and drivers should not make use of these interfaces: they can change without
* notice and programs that consume them will fail to run on future releases.
*/
-#define MEM_PAGE_RETIRE (('M' << 8) | 0x02)
-#define MEM_PAGE_ISRETIRED (('M' << 8) | 0x03)
#define MEM_NAME (('M' << 8) | 0x04)
#define MEM_INFO (('M' << 8) | 0x05)
+#define MEM_PAGE_RETIRE (('M' << 8) | 0x02)
+#define MEM_PAGE_ISRETIRED (('M' << 8) | 0x03)
+#define MEM_PAGE_UNRETIRE (('M' << 8) | 0x06)
+#define MEM_PAGE_GETERRORS (('M' << 8) | 0x07)
+#define MEM_PAGE_RETIRE_MCE (('M' << 8) | 0x08)
+#define MEM_PAGE_RETIRE_UE (('M' << 8) | 0x09)
+#define MEM_PAGE_RETIRE_TEST (('M' << 8) | 0x0A)
+
+/*
+ * Bits returned from the MEM_PAGE_GETERRORS ioctl, for use by fmd(1M).
+ */
+#define MEM_PAGE_ERR_NONE 0x0
+#define MEM_PAGE_ERR_MULTI_CE 0x1
+#define MEM_PAGE_ERR_UE 0x2
+#define MEM_PAGE_ERR_FMA_REQ 0x8
+
typedef struct mem_name {
uint64_t m_addr; /* memory address */
uint64_t m_synd; /* architecture-specific syndrome */
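A sketch of how a consumer such as fmd(1M) might decode the MEM_PAGE_GETERRORS bits defined above; the decoder function itself is hypothetical, only the MEM_PAGE_ERR_* names come from this patch. The bits may be OR'ed together, so the sketch checks them in rough severity order.

/* Hypothetical decoder for the MEM_PAGE_GETERRORS bits. */
#include <sys/types.h>
#include <sys/mem.h>

static const char *
mem_page_err_str(uint64_t bits)
{
	if (bits & MEM_PAGE_ERR_UE)
		return ("uncorrectable error");
	if (bits & MEM_PAGE_ERR_MULTI_CE)
		return ("multiple correctable errors");
	if (bits & MEM_PAGE_ERR_FMA_REQ)
		return ("retirement requested by a diagnosis engine");
	return ("no errors recorded");
}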
diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h
index 2e4183bdc0..c1db6f1391 100644
--- a/usr/src/uts/common/vm/page.h
+++ b/usr/src/uts/common/vm/page.h
@@ -76,6 +76,12 @@ typedef enum {
*/
#define SE_EXCL_WANTED 0x02
+/*
+ * All page_*lock() requests on a retired page will be denied unless
+ * this flag is set in the 'es' parameter.
+ */
+#define SE_RETIRED 0x04
+
#endif /* _KERNEL | _KMEMUSER */
typedef int selock_t;
@@ -630,37 +636,6 @@ struct lgrp;
#define PG_LIST_ISCAGE 0x2000
/*
- * Flags for setting the p_toxic flag when a page has errors
- * These flags may be OR'ed into the p_toxic page flag to
- * indicate that error(s) have occurred on a page,
- * (see page_settoxic()). If both PAGE_IS_TOXIC and
- * PAGE_IS_FAILING are set, PAGE_IS_FAILING takes precedence.
- *
- * When an error happens on a page, the trap handler sets
- * PAGE_IS_FAULTY on the page to indicate that an error has been
- * seen on the page. The error could be really a memory error or
- * something else (like a datapath error). When it is determined
- * that it is a memory error, the page is marked as PAGE_IS_TOXIC
- * or PAGE_IS_FAILING depending on the type of error and then
- * retired.
- *
- * We use the page's 'toxic' flag to determine whether the page
- * has just got a single error - PAGE_IS_TOXIC - or is being
- * retired due to multiple soft errors - PAGE_IS_FAILING. In
- * page_free(), a page that has been marked PAGE_IS_FAILING will
- * not be cleaned, it will always be retired. A page marked
- * PAGE_IS_TOXIC is cleaned and is retired only if this attempt at
- * cleaning fails.
- *
- * When a page has been successfully retired, we set PAGE_IS_RETIRED.
- */
-#define PAGE_IS_OK 0x0
-#define PAGE_IS_TOXIC 0x1
-#define PAGE_IS_FAILING 0x2
-#define PAGE_IS_RETIRED 0x4
-#define PAGE_IS_FAULTY 0x8
-
-/*
* Page frame operations.
*/
page_t *page_lookup(struct vnode *, u_offset_t, se_t);
@@ -707,6 +682,7 @@ void page_boot_demote(page_t *);
void page_promote_size(page_t *, uint_t);
void page_list_add_pages(page_t *, int);
void page_list_sub(page_t *, int);
+void page_list_sub_pages(page_t *, uint_t);
void page_list_xfer(page_t *, int, int);
void page_list_break(page_t **, page_t **, size_t);
void page_list_concat(page_t **, page_t **);
@@ -720,6 +696,7 @@ int page_try_reclaim_lock(page_t *, se_t, int);
int page_tryupgrade(page_t *);
void page_downgrade(page_t *);
void page_unlock(page_t *);
+void page_unlock_noretire(page_t *);
void page_lock_delete(page_t *);
int page_pp_lock(page_t *, int, int);
void page_pp_unlock(page_t *, int, int);
@@ -759,19 +736,22 @@ int page_isfree(page_t *);
int page_isref(page_t *);
int page_ismod(page_t *);
int page_release(page_t *, int);
-int page_retire(page_t *, uchar_t);
-int page_istoxic(page_t *);
-int page_isfailing(page_t *);
-int page_isretired(page_t *);
-int page_deteriorating(page_t *);
+void page_retire_init(void);
+int page_retire(uint64_t, uchar_t);
+int page_retire_check(uint64_t, uint64_t *);
+int page_unretire(uint64_t);
+int page_unretire_pp(page_t *, int);
+void page_tryretire(page_t *);
+void page_retire_hunt(void (*)(page_t *));
+void page_retire_mdboot_cb(page_t *);
+void page_clrtoxic(page_t *, uchar_t);
void page_settoxic(page_t *, uchar_t);
-void page_clrtoxic(page_t *);
-void page_clrtoxic_flag(page_t *, uchar_t);
-int page_isfaulty(page_t *);
+
int page_mem_avail(pgcnt_t);
void page_set_props(page_t *, uint_t);
void page_clr_all_props(page_t *);
+int page_clear_lck_cow(page_t *, int);
kmutex_t *page_vnode_mutex(struct vnode *);
kmutex_t *page_se_mutex(struct page *);
@@ -792,6 +772,7 @@ void page_free_replacement_page(page_t *);
int page_relocate_cage(page_t **, page_t **);
int page_try_demote_pages(page_t *);
+int page_try_demote_free_pages(page_t *);
void page_demote_free_pages(page_t *);
struct anon_map;
@@ -879,7 +860,56 @@ int page_szc_user_filtered(size_t);
#define PP_CLRMIGRATE(pp) ((pp)->p_state &= ~P_MIGRATE)
#define PP_CLRSWAP(pp) ((pp)->p_state &= ~P_SWAP)
-
+/*
+ * Flags for page_t p_toxic, for tracking memory hardware errors.
+ *
+ * These flags are OR'ed into p_toxic with page_settoxic() to track which
+ * error(s) have occurred on a given page. The flags are cleared with
+ * page_clrtoxic(). Both page_settoxic() and page_clrtoxic() use atomic
+ * primitives to manipulate the p_toxic field so no other locking is needed.
+ *
+ * When an error occurs on a page, p_toxic is set to record the error. The
+ * error could be a memory error or something else (e.g., a datapath error).
+ * The Page Retire mechanism does not try to determine the exact cause of
+ * the error; Page Retire rightly leaves that sort of determination to FMA's
+ * Diagnosis Engine (DE).
+ *
+ * Note that, while p_toxic bits can be set without holding any locks, they
+ * should only be cleared while holding the page exclusively locked.
+ *
+ * Pages with PR_UE or PR_FMA flags are retired unconditionally, while pages
+ * with PR_MCE are retired if the system has not retired too many of them.
+ *
+ * A page must be exclusively locked to be retired. Pages can be retired if
+ * they are mapped, modified, or both, as long as they are not marked PR_UE,
+ * since pages with uncorrectable errors cannot be relocated in memory.
+ * Once a page has been successfully retired it is zeroed, attached to the
+ * retired_pages vnode and, finally, PR_RETIRED is set in p_toxic. The other
+ * p_toxic bits are NOT cleared. Pages are not left locked after retiring them
+ * to avoid special case code throughout the kernel; rather, page_*lock() will
+ * fail to lock the page, unless SE_RETIRED is passed as an argument.
+ *
+ * While we have your attention, go take a look at the comments at the
+ * beginning of page_retire.c too.
+ */
+#define PR_OK 0x00 /* no problem */
+#define PR_MCE 0x01 /* page has seen two or more CEs */
+#define PR_UE 0x02 /* page has an unhandled UE */
+#define PR_UE_SCRUBBED 0x04 /* page has seen a UE but was cleaned */
+#define PR_FMA 0x08 /* A DE wants this page retired */
+#define PR_RESV 0x10 /* Reserved for future use */
+#define PR_BUSY 0x20 /* Page retire is in progress */
+#define PR_MSG 0x40 /* message(s) already printed for this page */
+#define PR_RETIRED 0x80 /* This page has been retired */
+
+#define PR_REASONS (PR_UE | PR_MCE | PR_FMA)
+#define PR_TOXIC (PR_UE)
+#define PR_ERRMASK (PR_UE | PR_UE_SCRUBBED | PR_MCE | PR_FMA)
+#define PR_ALLFLAGS (0xFF)
+
+#define PP_RETIRED(pp) ((pp)->p_toxic & PR_RETIRED)
+#define PP_TOXIC(pp) ((pp)->p_toxic & PR_TOXIC)
+#define PP_PR_REQ(pp) (((pp)->p_toxic & PR_REASONS) && !PP_RETIRED(pp))
/*
* kpm large page description.
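The p_toxic protocol above (set bits atomically from any context; clear them only while holding the page exclusively locked) can be illustrated with a stand-alone user-space analog. Everything below is a sketch using C11 atomics: only the flag values mirror this patch, and the fake_page type with its lock field is an invention for illustration.

/* Stand-alone analog of the p_toxic set/clear protocol; not kernel code. */
#include <stdatomic.h>
#include <stdint.h>
#include <assert.h>

#define PR_MCE		0x01	/* values mirror the defines above */
#define PR_RETIRED	0x80

typedef struct fake_page {
	atomic_uchar p_toxic;
	int p_excl_locked;	/* stand-in for holding SE_EXCL */
} fake_page_t;

static void
settoxic(fake_page_t *pp, uint8_t bits)
{
	/* safe from any context: atomic OR, no page lock required */
	atomic_fetch_or(&pp->p_toxic, bits);
}

static void
clrtoxic(fake_page_t *pp, uint8_t bits)
{
	/* clearing requires the exclusive lock, per the rules above */
	assert(pp->p_excl_locked);
	atomic_fetch_and(&pp->p_toxic, (uint8_t)~bits);
}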
diff --git a/usr/src/uts/common/vm/page_lock.c b/usr/src/uts/common/vm/page_lock.c
index 9a2d12dd8e..d34f7b2737 100644
--- a/usr/src/uts/common/vm/page_lock.c
+++ b/usr/src/uts/common/vm/page_lock.c
@@ -189,16 +189,17 @@ uint_t page_lock_reclaim;
uint_t page_lock_bad_reclaim;
uint_t page_lock_same_page;
uint_t page_lock_upgrade;
+uint_t page_lock_retired;
uint_t page_lock_upgrade_failed;
uint_t page_lock_deleted;
uint_t page_trylock_locked;
+uint_t page_trylock_failed;
uint_t page_trylock_missed;
uint_t page_try_reclaim_upgrade;
#endif /* VM_STATS */
-
/*
* Acquire the "shared/exclusive" lock on a page.
*
@@ -222,27 +223,47 @@ page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
* callers wanting an exclusive (writer) lock may prevent shared-lock
* (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
* In this case, when an exclusive lock cannot be acquired, p_selock's
- * SE_EWANTED bit is set.
- * This bit, along with the se and es parameters, are used to decide
- * if the requested lock should be granted:
+ * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied
+ * if the page is slated for retirement.
+ *
+ * The se and es parameters determine if the lock should be granted
+ * based on the following decision table:
+ *
+ * Lock wanted es flags p_selock/SE_EWANTED Action
+ * ----------- -------------- ------------------- ---------
+ * SE_EXCL any [1][2] unlocked/any grant lock, clear SE_EWANTED
+ * SE_EXCL SE_EWANTED any lock/any deny, set SE_EWANTED
+ * SE_EXCL none any lock/any deny
+ * SE_SHARED n/a [2][3] shared/0 grant
+ * SE_SHARED n/a [2][3] unlocked/0 grant
+ * SE_SHARED n/a shared/1 deny
+ * SE_SHARED n/a unlocked/1 deny
+ * SE_SHARED n/a excl/any deny
+ *
+ * Notes:
+ * [1] The code grants an exclusive lock to the caller and clears the bit
+ * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
+ * bit's value. This was deemed acceptable as we are not concerned about
+ * exclusive-lock starvation. If this ever becomes an issue, a priority or
+ * fifo mechanism should also be implemented. In the meantime, the thread
+ * that set SE_EWANTED should be prepared to catch this condition and reset it.
+ *
+ * [2] Retired pages may not be locked at any time, regardless of the
+ * disposition of se, unless the es parameter has the SE_RETIRED flag set.
*
- * Lock wanted SE_EXCL_WANTED p_selock/SE_EWANTED Action
- * ---------- -------------- ------------------- ---------
- * SE_EXCL no dont-care/1 deny lock
- * SE_EXCL any(see note) unlocked/any grant lock, clear SE_EWANTED
- * SE_EXCL yes any lock/any deny, set SE_EWANTED
- * SE_EXCL no any lock/any deny
- * SE_SHARED not applicable shared/0 grant
- * SE_SHARED not applicable unlocked/0 grant
- * SE_SHARED not applicable shared/1 deny
- * SE_SHARED not applicable unlocked/1 deny
- * SE_SHARED not applicable excl/any deny
+ * [3] If the page is slated for retirement, the lock is denied.
*
- * Note: the code grants an exclusive lock to the caller and clears
- * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
- * bit's value. This was deemed acceptable as we are not concerned about
- * exclusive-lock starvation. If this ever becomes an issue, a priority or
- * fifo mechanism should also be implemented.
+ * Notes on values of "es":
+ *
+ * es & 1: page_lookup_create will attempt page relocation
+ * es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
+ * memory thread); this prevents reader-starvation of waiting
+ * writer thread(s) by giving priority to writers over readers.
+ * es & SE_RETIRED: caller wants to lock pages even if they are
+ * retired. Default is to deny the lock if the page is retired.
+ *
+ * And yes, we know, the semantics of this function are too complicated.
+ * It's on the list to be cleaned up.
*/
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
@@ -261,17 +282,14 @@ page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
mutex_enter(pse);
- /*
- * Current uses of 'es':
- * es == 1 page_lookup_create will attempt page relocation
- * es == SE_EXCL_WANTED caller wants SE_EWANTED set (eg. delete
- * memory thread); this prevents reader-starvation of waiting
- * writer thread(s).
- */
-
-
ASSERT(((es & SE_EXCL_WANTED) == 0) ||
- ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));
+ ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
+
+ if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
+ mutex_exit(pse);
+ VM_STAT_ADD(page_lock_retired);
+ return (0);
+ }
if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
se = SE_EXCL;
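The decision table above can be restated as a small predicate. The sketch below is illustrative only and omits the kvp exception for shared locks; 'selock' mimics p_selock, with 0 meaning unlocked, positive values counting readers, and negative values meaning a writer holds the lock.

/* Illustrative restatement of the page_lock_es() grant/deny rules. */
#define SE_EXCL_WANTED	0x02
#define SE_RETIRED	0x04

typedef enum { SE_SHARED, SE_EXCL } se_t;

static int
would_grant(se_t se, int es, int selock, int ewanted, int retired,
    int retire_pending)
{
	if (retired && !(es & SE_RETIRED))
		return (0);		/* note [2]: retired pages deny all */
	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && ewanted)
			return (0);	/* defer to the waiting writer */
		return (selock == 0);	/* grant only when unlocked */
	}
	/* SE_SHARED: deny on writer, waiting writer, or pending retire [3] */
	return (selock >= 0 && !ewanted && !retire_pending);
}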
@@ -312,7 +330,7 @@ page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
}
if (se == SE_EXCL) {
- if ((es != SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
+ if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
/*
* if the caller wants a writer lock (but did not
* specify exclusive access), and there is a pending
@@ -327,7 +345,7 @@ page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
retval = 1;
} else {
/* page is locked */
- if (es == SE_EXCL_WANTED) {
+ if (es & SE_EXCL_WANTED) {
/* set the SE_EWANTED bit */
pp->p_selock |= SE_EWANTED;
}
@@ -336,10 +354,17 @@ page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
} else {
retval = 0;
if (pp->p_selock >= 0) {
- /* readers are not allowed when excl wanted */
- if (!(pp->p_selock & SE_EWANTED)) {
- pp->p_selock += SE_READER;
- retval = 1;
+ /*
+ * Readers are not allowed when excl wanted or
+ * a retire is pending. Since kvp pages can take
+ * a long time to be retired, we make an exception
+ * for them to avoid hanging threads unnecessarily.
+ */
+ if ((pp->p_selock & SE_EWANTED) == 0) {
+ if (!PP_PR_REQ(pp) || pp->p_vnode == &kvp) {
+ pp->p_selock += SE_READER;
+ retval = 1;
+ }
}
}
}
@@ -468,7 +493,13 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es)
old = pp->p_selock;
ASSERT(((es & SE_EXCL_WANTED) == 0) ||
- ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));
+ ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
+
+ if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
+ mutex_exit(pse);
+ VM_STAT_ADD(page_trylock_failed);
+ return (0);
+ }
if (se == SE_SHARED && es == 1 && old == 0) {
se = SE_EXCL;
@@ -477,11 +508,20 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es)
if (se == SE_SHARED) {
if (!PP_ISFREE(pp)) {
if (old >= 0) {
- /* readers are not allowed when excl wanted */
- if (!(old & SE_EWANTED)) {
- pp->p_selock = old + SE_READER;
- mutex_exit(pse);
- return (1);
+ /*
+ * Readers are not allowed when excl wanted
+ * or a retire is pending. Since kvp pages can
+ * take a long time to be retired, we make an
+ * exception for them to avoid hanging threads
+ * unnecessarily.
+ */
+ if ((old & SE_EWANTED) == 0) {
+ if (!PP_PR_REQ(pp) ||
+ pp->p_vnode == &kvp) {
+ pp->p_selock = old + SE_READER;
+ mutex_exit(pse);
+ return (1);
+ }
}
}
mutex_exit(pse);
@@ -498,7 +538,7 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es)
* SE_EWANTED is not set, or if the caller specified
* SE_EXCL_WANTED.
*/
- if (!(old & SE_EWANTED) || (es == SE_EXCL_WANTED)) {
+ if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
if ((old & ~SE_EWANTED) == 0) {
/* no reader/writer lock held */
THREAD_KPRI_REQUEST();
@@ -508,7 +548,7 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es)
return (1);
}
}
- if (es == SE_EXCL_WANTED) {
+ if (es & SE_EXCL_WANTED) {
/* page is locked, set the SE_EWANTED bit */
pp->p_selock |= SE_EWANTED;
}
@@ -526,9 +566,15 @@ page_trylock(page_t *pp, se_t se)
kmutex_t *pse = PAGE_SE_MUTEX(pp);
mutex_enter(pse);
- if (pp->p_selock & SE_EWANTED) {
- /* fail if a thread wants exclusive access */
+ if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
+ (se == SE_SHARED && PP_PR_REQ(pp) && pp->p_vnode != &kvp)) {
+ /*
+ * Fail if a thread wants exclusive access and page is
+ * retired, if the page is slated for retirement, or a
+ * share lock is requested.
+ */
mutex_exit(pse);
+ VM_STAT_ADD(page_trylock_failed);
return (0);
}
@@ -551,6 +597,41 @@ page_trylock(page_t *pp, se_t se)
}
/*
+ * Variant of page_unlock() specifically for the page freelist
+ * code. The mere existence of this code is a vile hack that
+ * has resulted due to the backwards locking order of the page
+ * freelist manager; please don't call it.
+ */
+void
+page_unlock_noretire(page_t *pp)
+{
+ kmutex_t *pse = PAGE_SE_MUTEX(pp);
+ selock_t old;
+
+ mutex_enter(pse);
+
+ old = pp->p_selock;
+ if ((old & ~SE_EWANTED) == SE_READER) {
+ pp->p_selock = old & ~SE_READER;
+ if (CV_HAS_WAITERS(&pp->p_cv))
+ cv_broadcast(&pp->p_cv);
+ } else if ((old & ~SE_EWANTED) == SE_DELETED) {
+ panic("page_unlock_noretire: page %p is deleted", pp);
+ } else if (old < 0) {
+ THREAD_KPRI_RELEASE();
+ pp->p_selock &= SE_EWANTED;
+ if (CV_HAS_WAITERS(&pp->p_cv))
+ cv_broadcast(&pp->p_cv);
+ } else if ((old & ~SE_EWANTED) > SE_READER) {
+ pp->p_selock = old - SE_READER;
+ } else {
+ panic("page_unlock_noretire: page %p is not locked", pp);
+ }
+
+ mutex_exit(pse);
+}
+
+/*
* Release the page's "shared/exclusive" lock and wake up anyone
* who might be waiting for it.
*/
@@ -561,6 +642,7 @@ page_unlock(page_t *pp)
selock_t old;
mutex_enter(pse);
+
old = pp->p_selock;
if ((old & ~SE_EWANTED) == SE_READER) {
pp->p_selock = old & ~SE_READER;
@@ -578,7 +660,29 @@ page_unlock(page_t *pp)
} else {
panic("page_unlock: page %p is not locked", pp);
}
- mutex_exit(pse);
+
+ if (pp->p_selock == 0 && PP_PR_REQ(pp)) {
+ /*
+ * Try to retire the page. If it retires, great.
+ * If not, oh well, we'll get it in the next unlock
+ * request, and repeat the cycle. Regardless,
+ * page_tryretire() will drop the page lock.
+ */
+ if ((pp->p_toxic & PR_BUSY) == 0) {
+ THREAD_KPRI_REQUEST();
+ pp->p_selock = SE_WRITER;
+ page_settoxic(pp, PR_BUSY);
+ mutex_exit(pse);
+ page_tryretire(pp);
+ } else {
+ pp->p_selock = SE_WRITER;
+ page_clrtoxic(pp, PR_BUSY);
+ pp->p_selock = 0;
+ mutex_exit(pse);
+ }
+ } else {
+ mutex_exit(pse);
+ }
}
/*
diff --git a/usr/src/uts/common/vm/page_retire.c b/usr/src/uts/common/vm/page_retire.c
new file mode 100644
index 0000000000..30b218c15d
--- /dev/null
+++ b/usr/src/uts/common/vm/page_retire.c
@@ -0,0 +1,1473 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Page Retire - Big Theory Statement.
+ *
+ * This file handles removing sections of faulty memory from use when the
+ * userland FMA Diagnosis Engine (DE) requests that a page be removed or when
+ * a CE or UE is detected by the hardware.
+ *
+ * In the bad old days, the kernel side of Page Retire did a lot of the work
+ * on its own. Now, with the DE keeping track of errors, the kernel side is
+ * rather simple minded on most platforms.
+ *
+ * Errors are all reflected to the DE, and after digesting the error and
+ * looking at all previously reported errors, the DE decides what should
+ * be done about the current error. If the DE wants a particular page to
+ * be retired, then the kernel page retire code is invoked via an ioctl.
+ * On non-FMA platforms, the ue_drain and ce_drain paths end up calling
+ * page retire to handle the error. Since page retire is a simple
+ * mechanism, it doesn't need to differentiate between the different callers.
+ *
+ * The p_toxic field in the page_t is used to indicate which errors have
+ * occurred and what action has been taken on a given page. Because errors are
+ * reported without regard to the locked state of a page, no locks are used
+ * to SET the error bits in p_toxic. However, in order to clear the error
+ * bits, the page_t must be held exclusively locked.
+ *
+ * When page_retire() is called, it must be able to acquire locks, sleep, etc.
+ * It must not be called from high-level interrupt context.
+ *
+ * Depending on how the requested page is being used at the time of the retire
+ * request (and on the availability of sufficient system resources), the page
+ * may be retired immediately, or just marked for retirement later. For
+ * example, locked pages are marked, while free pages are retired. Multiple
+ * requests may be made to retire the same page, although there is no need
+ * to: once the p_toxic flags are set, the page will be retired as soon as it
+ * can be exclusively locked.
+ *
+ * The retire mechanism is driven centrally out of page_unlock(). To expedite
+ * the retirement of pages, further requests for SE_SHARED locks are denied
+ * as long as a page retirement is pending. In addition, as long as pages are
+ * pending retirement a background thread runs periodically trying to retire
+ * those pages. Pages which could not be retired while the system is running
+ * are scrubbed prior to rebooting to avoid latent errors on the next boot.
+ *
+ * Single CE pages and UE pages without persistent errors are scrubbed and
+ * returned to service. Recidivist pages, as well as FMA-directed requests
+ * for retirement, result in the page being taken out of service. Once the
+ * decision is made to take a page out of service, the page is cleared, hashed
+ * onto the retired_pages vnode, marked as retired, and unlocked. No
+ * other requesters (except for unretire) are allowed to lock retired pages.
+ *
+ * The public routines return (sadly) 0 if they worked and a non-zero error
+ * value if something went wrong. This is done for the ioctl side of the
+ * world to allow errors to be reflected all the way out to user land. The
+ * non-zero values are explained in comments atop each function.
+ */
+
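To recap the flow in one place, the mapping below is illustrative only; the function and flag names are from this patch, but the summary itself is not part of the original comment.

/*
 * How the entry points described above reach page_retire():
 *
 *	FMA DE via the mem ioctl MEM_PAGE_RETIRE  -> page_retire(pa, PR_FMA)
 *	repeated CEs (e.g. the ce_drain path)     -> page_retire(pa, PR_MCE)
 *	uncorrectable error (e.g. ue_drain)       -> page_retire(pa, PR_UE)
 */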
+/*
+ * Things to fix:
+ *
+ * 1. Cleanup SE_EWANTED. Since we're aggressive about trying to retire
+ * pages, we can use page_retire_pp() to replace SE_EWANTED and all
+ * the special delete_memory_thread() code just goes away.
+ *
+ * 2. Trying to retire non-relocatable kvp pages may result in a
+ * quagmire. This is because seg_kmem() no longer keeps its pages locked,
+ * and calls page_lookup() in the free path; since kvp pages are modified
+ * and don't have a usable backing store, page_retire() can't do anything
+ * with them, and we'll keep denying the lock to seg_kmem_free() in a
+ * vicious cycle. To prevent that, we don't deny locks to kvp pages, and
+ * hence only call page_retire_pp() from page_unlock() in the free path.
+ * Since most kernel pages are indefinitely held anyway, and don't
+ * participate in I/O, this is of little consequence.
+ *
+ * 3. Low memory situations will be interesting. If we don't have
+ * enough memory for page_relocate() to succeed, we won't be able to
+ * retire dirty pages; nobody will be able to push them out to disk
+ * either, since we aggressively deny the page lock. We could change
+ * fsflush so it can recognize this situation, grab the lock, and push
+ * the page out, where we'll catch it in the free path and retire it.
+ *
+ * 4. Beware of places that have code like this in them:
+ *
+ * if (! page_tryupgrade(pp)) {
+ * page_unlock(pp);
+ * while (! page_lock(pp, SE_EXCL, NULL, P_RECLAIM)) {
+ * / *NOTHING* /
+ * }
+ * }
+ * page_free(pp);
+ *
+ * The problem is that pp can change identity right after the
+ * page_unlock() call. In particular, page_retire() can step in
+ * there, change pp's identity, and hash pp onto the retired_vnode.
+ *
+ * Of course, other functions besides page_retire() can have the
+ * same effect. A kmem reader can waltz by, set up a mapping to the
+ * page, and then unlock the page. Page_free() will then go castors
+ * up. So if anybody is doing this, it's already a bug.
+ *
+ * 5. mdboot()'s call into page_retire_hunt() should probably be
+ * moved lower. Where the call is made now, we can get into trouble
+ * by scrubbing a kernel page that is then accessed later.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mman.h>
+#include <sys/vnode.h>
+#include <sys/cmn_err.h>
+#include <sys/ksynch.h>
+#include <sys/thread.h>
+#include <sys/disp.h>
+#include <sys/ontrap.h>
+#include <sys/vmsystm.h>
+#include <sys/mem_config.h>
+#include <sys/atomic.h>
+#include <sys/callb.h>
+#include <vm/page.h>
+#include <vm/vm_dep.h>
+#include <vm/as.h>
+#include <vm/hat.h>
+
+/*
+ * vnode for all pages which are retired from the VM system;
+ */
+vnode_t *retired_pages;
+
+/*
+ * Background thread that wakes up periodically to try to retire pending
+ * pages. This prevents threads from becoming blocked indefinitely in
+ * page_lookup() or some other routine should the page(s) they are waiting
+ * on become eligible for social security.
+ */
+static void page_retire_thread(void);
+static kthread_t *pr_thread_id;
+static kcondvar_t pr_cv;
+static kmutex_t pr_thread_mutex;
+static clock_t pr_thread_shortwait;
+static clock_t pr_thread_longwait;
+
+/*
+ * Make a list of all of the pages that have been marked for retirement
+ * but are not yet retired. At system shutdown, we will scrub all of the
+ * pages in the list in case there are outstanding UEs. Then, we
+ * cross-check this list against the number of pages that are yet to be
+ * retired, and if we find inconsistencies, we scan every page_t in the
+ * whole system looking for any pages that need to be scrubbed for UEs.
+ * The background thread also uses this queue to determine which pages
+ * it should keep trying to retire.
+ */
+#ifdef DEBUG
+#define PR_PENDING_QMAX 32
+#else /* DEBUG */
+#define PR_PENDING_QMAX 256
+#endif /* DEBUG */
+page_t *pr_pending_q[PR_PENDING_QMAX];
+kmutex_t pr_q_mutex;
+
+/*
+ * Page retire global kstats
+ */
+struct page_retire_kstat {
+ kstat_named_t pr_retired;
+ kstat_named_t pr_requested;
+ kstat_named_t pr_requested_free;
+ kstat_named_t pr_enqueue_fail;
+ kstat_named_t pr_dequeue_fail;
+ kstat_named_t pr_pending;
+ kstat_named_t pr_failed;
+ kstat_named_t pr_failed_kernel;
+ kstat_named_t pr_limit;
+ kstat_named_t pr_limit_exceeded;
+ kstat_named_t pr_fma;
+ kstat_named_t pr_mce;
+ kstat_named_t pr_ue;
+ kstat_named_t pr_ue_cleared_retire;
+ kstat_named_t pr_ue_cleared_free;
+ kstat_named_t pr_ue_persistent;
+ kstat_named_t pr_unretired;
+};
+
+static struct page_retire_kstat page_retire_kstat = {
+ { "pages_retired", KSTAT_DATA_UINT64},
+ { "pages_retire_request", KSTAT_DATA_UINT64},
+ { "pages_retire_request_free", KSTAT_DATA_UINT64},
+ { "pages_notenqueued", KSTAT_DATA_UINT64},
+ { "pages_notdequeued", KSTAT_DATA_UINT64},
+ { "pages_pending", KSTAT_DATA_UINT64},
+ { "pages_deferred", KSTAT_DATA_UINT64},
+ { "pages_deferred_kernel", KSTAT_DATA_UINT64},
+ { "pages_limit", KSTAT_DATA_UINT64},
+ { "pages_limit_exceeded", KSTAT_DATA_UINT64},
+ { "pages_fma", KSTAT_DATA_UINT64},
+ { "pages_multiple_ce", KSTAT_DATA_UINT64},
+ { "pages_ue", KSTAT_DATA_UINT64},
+ { "pages_ue_cleared_retired", KSTAT_DATA_UINT64},
+ { "pages_ue_cleared_freed", KSTAT_DATA_UINT64},
+ { "pages_ue_persistent", KSTAT_DATA_UINT64},
+ { "pages_unretired", KSTAT_DATA_UINT64},
+};
+
+static kstat_t *page_retire_ksp = NULL;
+
+#define PR_INCR_KSTAT(stat) \
+ atomic_add_64(&(page_retire_kstat.stat.value.ui64), 1)
+#define PR_DECR_KSTAT(stat) \
+ atomic_add_64(&(page_retire_kstat.stat.value.ui64), -1)
+
+#define PR_KSTAT_RETIRED_CE (page_retire_kstat.pr_mce.value.ui64)
+#define PR_KSTAT_RETIRED_FMA (page_retire_kstat.pr_fma.value.ui64)
+#define PR_KSTAT_RETIRED_NOTUE (PR_KSTAT_RETIRED_CE + PR_KSTAT_RETIRED_FMA)
+#define PR_KSTAT_PENDING (page_retire_kstat.pr_pending.value.ui64)
+#define PR_KSTAT_EQFAIL (page_retire_kstat.pr_enqueue_fail.value.ui64)
+#define PR_KSTAT_DQFAIL (page_retire_kstat.pr_dequeue_fail.value.ui64)
+
+/*
+ * Limit the number of multiple CE page retires.
+ * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in
+ * basis points, where 100 basis points equals one percent.
+ */
+#define MCE_BPT 10
+uint64_t max_pages_retired_bps = MCE_BPT;
+#define PAGE_RETIRE_LIMIT ((physmem * max_pages_retired_bps) / 10000)
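A worked instance of this arithmetic, on an assumed configuration: with 8 KB pages and physmem = 2,097,152 pages (16 GB), the default of MCE_BPT = 10 basis points gives PAGE_RETIRE_LIMIT = (2097152 * 10) / 10000 = 2097 pages that may be retired for non-UE reasons before further requests are refused.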
+
+/*
+ * Control over the verbosity of page retirement.
+ *
+ * When set to zero (the default), no messages will be printed.
+ * When set to one, summary messages will be printed.
+ * When set > one, all messages will be printed.
+ *
+ * A value of one is intended as a platform tunable for processors where
+ * FMA's DE does not run (e.g., Spitfire). Values greater than one are
+ * intended for debugging only.
+ */
+int page_retire_messages = 0;
+
+/*
+ * Control whether or not we retire dirty UE pages. By default we do
+ * since we assume the data is corrupt and the process(es) using it will
+ * be killed. This is platform tunable only, and should probably not be
+ * changed, ever.
+ */
+int page_retire_modified = 1;
+
+/*
+ * Control whether or not we return scrubbed UE pages to service.
+ * By default we do not since FMA wants to run its diagnostics first
+ * and then ask us to unretire the page if it passes. Non-FMA platforms
+ * may set this to zero so we will only retire recidivist pages. It should
+ * not be changed by the user.
+ */
+int page_retire_first_ue = 1;
+
+/*
+ * Master enable for page retire. This prevents a CE or UE early in boot
+ * from trying to retire a page before page_retire_init() has finished
+ * setting things up. This is internal only and is not a tunable!
+ */
+static int pr_enable = 0;
+
+extern struct vnode kvp;
+
+#ifdef DEBUG
+struct page_retire_debug {
+ int prd_dup;
+ int prd_noaction;
+ int prd_queued;
+ int prd_notqueued;
+ int prd_dequeue;
+ int prd_top;
+ int prd_locked;
+ int prd_reloc;
+ int prd_modce;
+ int prd_modue_fail;
+ int prd_modue_retire;
+ int prd_kern;
+ int prd_free;
+ int prd_noreclaim;
+ int prd_hashout;
+ int prd_fma;
+ int prd_uescrubbed;
+ int prd_uenotscrubbed;
+ int prd_mce;
+ int prd_prlocked;
+ int prd_prnotlocked;
+ int prd_prretired;
+ int prd_ulocked;
+ int prd_unotretired;
+ int prd_udestroy;
+ int prd_uhashout;
+ int prd_uunretired;
+ int prd_unotlocked;
+ int prd_checkhit;
+ int prd_checkmiss;
+ int prd_tctop;
+ int prd_tclocked;
+ int prd_hunt;
+ int prd_dohunt;
+ int prd_earlyhunt;
+ int prd_latehunt;
+ int prd_nofreedemote;
+ int prd_nodemote;
+ int prd_demoted;
+} pr_debug;
+
+#define PR_DEBUG(foo) ((pr_debug.foo)++)
+
+/*
+ * A type histogram. We record the incidence of the various toxic
+ * flag combinations along with the interesting page attributes. The
+ * goal is to get as many combinations as we can while driving all
+ * pr_debug values nonzero (indicating we've exercised all possible
+ * code paths across all possible page types). Not all combinations
+ * will make sense -- e.g. PRT_MOD|PRT_KERNEL.
+ *
+ * Byte offsets into pr_types[] (flag value * sizeof (int)), for use
+ * when examining the histogram with a debugger:
+ *
+ * PRT_NAMED - 0x4
+ * PRT_KERNEL - 0x8
+ * PRT_FREE - 0x10
+ * PRT_MOD - 0x20
+ * PRT_FMA - 0x0
+ * PRT_MCE - 0x40
+ * PRT_UE - 0x80
+ */
+
+#define PRT_NAMED 0x01
+#define PRT_KERNEL 0x02
+#define PRT_FREE 0x04
+#define PRT_MOD 0x08
+#define PRT_FMA 0x00 /* yes, this is not a mistake */
+#define PRT_MCE 0x10
+#define PRT_UE 0x20
+#define PRT_ALL 0x3F
+
+int pr_types[PRT_ALL+1];
+
+#define PR_TYPES(pp) { \
+ int whichtype = 0; \
+ if (pp->p_vnode) \
+ whichtype |= PRT_NAMED; \
+ if (pp->p_vnode == &kvp) \
+ whichtype |= PRT_KERNEL; \
+ if (PP_ISFREE(pp)) \
+ whichtype |= PRT_FREE; \
+ if (hat_ismod(pp)) \
+ whichtype |= PRT_MOD; \
+ if (pp->p_toxic & PR_UE) \
+ whichtype |= PRT_UE; \
+ if (pp->p_toxic & PR_MCE) \
+ whichtype |= PRT_MCE; \
+ pr_types[whichtype]++; \
+}
+
+int recl_calls;
+int recl_mtbf = 3;
+int reloc_calls;
+int reloc_mtbf = 7;
+int pr_calls;
+int pr_mtbf = 15;
+
+#define MTBF(v, f) (((++(v)) & (f)) != (f))
+
+#else /* DEBUG */
+
+#define PR_DEBUG(foo) /* nothing */
+#define PR_TYPES(foo) /* nothing */
+#define MTBF(v, f) (1)
+
+#endif /* DEBUG */
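The MTBF() macro above is a fault-injection hook: for a mask f of the form 2^n - 1, it evaluates false once every 2^n calls in DEBUG kernels (and compiles to constant truth otherwise). A stand-alone demonstration follows; everything outside the macro itself is illustrative.

/* Demo of the MTBF fault-injection pattern; prints "4 of 64". */
#include <stdio.h>

#define MTBF(v, f)	(((++(v)) & (f)) != (f))

int
main(void)
{
	int calls = 0, injected = 0, i;

	for (i = 0; i < 64; i++) {
		if (!MTBF(calls, 15))	/* same mask as pr_mtbf above */
			injected++;	/* simulated failure path */
	}
	(void) printf("%d of %d\n", injected, 64);
	return (0);
}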
+
+/*
+ * page_retire_done() - completion processing
+ *
+ * Used by the page_retire code for common completion processing.
+ * It keeps track of how many times a given result has happened,
+ * and writes out an occasional message.
+ *
+ * May be called with a NULL pp (PRD_INVALID_PA case).
+ */
+#define PRD_INVALID_KEY -1
+#define PRD_SUCCESS 0
+#define PRD_PENDING 1
+#define PRD_FAILED 2
+#define PRD_DUPLICATE 3
+#define PRD_INVALID_PA 4
+#define PRD_LIMIT 5
+#define PRD_UE_SCRUBBED 6
+#define PRD_UNR_SUCCESS 7
+#define PRD_UNR_CANTLOCK 8
+#define PRD_UNR_NOT 9
+
+typedef struct page_retire_op {
+ int pr_key; /* one of the PRD_* defines from above */
+ int pr_count; /* How many times this has happened */
+ int pr_retval; /* return value */
+ int pr_msglvl; /* message level - when to print */
+ char *pr_message; /* Cryptic message for field service */
+} page_retire_op_t;
+
+static page_retire_op_t page_retire_ops[] = {
+ /* key count retval msglvl message */
+ {PRD_SUCCESS, 0, 0, 1,
+ "Page 0x%08x.%08x removed from service"},
+ {PRD_PENDING, 0, EAGAIN, 2,
+ "Page 0x%08x.%08x will be retired on free"},
+ {PRD_FAILED, 0, EAGAIN, 0, NULL},
+ {PRD_DUPLICATE, 0, EBUSY, 2,
+ "Page 0x%08x.%08x already retired"},
+ {PRD_INVALID_PA, 0, EINVAL, 2,
+ "PA 0x%08x.%08x is not a relocatable page"},
+ {PRD_LIMIT, 0, 0, 1,
+ "Page 0x%08x.%08x not retired due to limit exceeded"},
+ {PRD_UE_SCRUBBED, 0, 0, 1,
+ "Previously reported error on page 0x%08x.%08x cleared"},
+ {PRD_UNR_SUCCESS, 0, 0, 1,
+ "Page 0x%08x.%08x returned to service"},
+ {PRD_UNR_CANTLOCK, 0, EAGAIN, 2,
+ "Page 0x%08x.%08x could not be unretired"},
+ {PRD_UNR_NOT, 0, EBADF, 2,
+ "Page 0x%08x.%08x is not retired"},
+ {PRD_INVALID_KEY, 0, 0, 0, NULL} /* MUST BE LAST! */
+};
+
+/*
+ * print a message if page_retire_messages is true.
+ */
+#define PR_MESSAGE(debuglvl, msglvl, msg, pa) \
+{ \
+ uint64_t p = (uint64_t)pa; \
+ if (page_retire_messages >= msglvl && msg != NULL) { \
+ cmn_err(debuglvl, msg, \
+ (uint32_t)(p >> 32), (uint32_t)p); \
+ } \
+}
+
+/*
+ * Note that multiple bits may be set in a single settoxic operation.
+ * May be called without the page locked.
+ */
+void
+page_settoxic(page_t *pp, uchar_t bits)
+{
+ atomic_or_8(&pp->p_toxic, bits);
+}
+
+/*
+ * Note that multiple bits may be cleared in a single clrtoxic operation.
+ * Must be called with the page exclusively locked.
+ */
+void
+page_clrtoxic(page_t *pp, uchar_t bits)
+{
+ ASSERT(PAGE_EXCL(pp));
+ atomic_and_8(&pp->p_toxic, ~bits);
+}
+
+/*
+ * Prints any page retire messages to the user, and decides what
+ * error code is appropriate for the condition reported.
+ */
+static int
+page_retire_done(page_t *pp, int code)
+{
+ page_retire_op_t *prop;
+ uint64_t pa = 0;
+ int i;
+
+ if (pp != NULL) {
+ pa = mmu_ptob(pp->p_pagenum);
+ }
+
+ prop = NULL;
+ for (i = 0; page_retire_ops[i].pr_key != PRD_INVALID_KEY; i++) {
+ if (page_retire_ops[i].pr_key == code) {
+ prop = &page_retire_ops[i];
+ break;
+ }
+ }
+
+#ifdef DEBUG
+ if (page_retire_ops[i].pr_key == PRD_INVALID_KEY) {
+ cmn_err(CE_PANIC, "page_retire_done: Invalid opcode %d", code);
+ }
+#endif
+
+ ASSERT(prop->pr_key == code);
+
+ prop->pr_count++;
+
+ PR_MESSAGE(CE_NOTE, prop->pr_msglvl, prop->pr_message, pa);
+ if (pp != NULL) {
+ page_settoxic(pp, PR_MSG);
+ }
+
+ return (prop->pr_retval);
+}
+
+/*
+ * On a reboot, our friend mdboot() wants to clear up any PP_PR_REQ() pages
+ * that we were not able to retire. On large machines, walking the complete
+ * page_t array and looking at every page_t takes too long. So, as a page is
+ * marked toxic, we track it using a list that can be processed at reboot
+ * time. page_retire_enqueue() will do its best to try to avoid duplicate
+ * entries, but if we get too many errors at once the queue can overflow,
+ * in which case we will end up walking every page_t as a last resort.
+ * The background thread also makes use of this queue to find which pages
+ * are pending retirement.
+ */
+static void
+page_retire_enqueue(page_t *pp)
+{
+ int nslot = -1;
+ int i;
+
+ mutex_enter(&pr_q_mutex);
+
+ /*
+ * Check to make sure retire hasn't already dequeued it.
+ * In the meantime if the page was cleaned up, no need
+ * to enqueue it.
+ */
+ if (PP_RETIRED(pp) || pp->p_toxic == 0) {
+ mutex_exit(&pr_q_mutex);
+ PR_DEBUG(prd_noaction);
+ return;
+ }
+
+ for (i = 0; i < PR_PENDING_QMAX; i++) {
+ if (pr_pending_q[i] == pp) {
+ mutex_exit(&pr_q_mutex);
+ PR_DEBUG(prd_dup);
+ return;
+ } else if (nslot == -1 && pr_pending_q[i] == NULL) {
+ nslot = i;
+ }
+ }
+
+ PR_INCR_KSTAT(pr_pending);
+
+ if (nslot != -1) {
+ pr_pending_q[nslot] = pp;
+ PR_DEBUG(prd_queued);
+ } else {
+ PR_INCR_KSTAT(pr_enqueue_fail);
+ PR_DEBUG(prd_notqueued);
+ }
+ mutex_exit(&pr_q_mutex);
+}
+
+static void
+page_retire_dequeue(page_t *pp)
+{
+ int i;
+
+ mutex_enter(&pr_q_mutex);
+
+ for (i = 0; i < PR_PENDING_QMAX; i++) {
+ if (pr_pending_q[i] == pp) {
+ pr_pending_q[i] = NULL;
+ break;
+ }
+ }
+
+ if (i == PR_PENDING_QMAX) {
+ PR_INCR_KSTAT(pr_dequeue_fail);
+ }
+
+ PR_DECR_KSTAT(pr_pending);
+ PR_DEBUG(prd_dequeue);
+
+ mutex_exit(&pr_q_mutex);
+}
+
+/*
+ * Act like page_destroy(), but instead of freeing the page, hash it onto
+ * the retired_pages vnode, and mark it retired.
+ *
+ * For fun, we try to scrub the page until it's squeaky clean.
+ * availrmem is adjusted here.
+ */
+static void
+page_retire_destroy(page_t *pp)
+{
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(!PP_ISFREE(pp));
+ ASSERT(pp->p_szc == 0);
+ ASSERT(!hat_page_is_mapped(pp));
+ ASSERT(!pp->p_vnode);
+
+ page_clr_all_props(pp);
+ pagescrub(pp, 0, MMU_PAGESIZE);
+
+ pp->p_next = NULL;
+ pp->p_prev = NULL;
+ if (page_hashin(pp, retired_pages, (u_offset_t)pp, NULL) == 0) {
+ cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp);
+ }
+
+ page_settoxic(pp, PR_RETIRED);
+ page_clrtoxic(pp, PR_BUSY);
+ page_retire_dequeue(pp);
+ PR_INCR_KSTAT(pr_retired);
+
+ if (pp->p_toxic & PR_FMA) {
+ PR_INCR_KSTAT(pr_fma);
+ } else if (pp->p_toxic & PR_UE) {
+ PR_INCR_KSTAT(pr_ue);
+ } else {
+ PR_INCR_KSTAT(pr_mce);
+ }
+
+ mutex_enter(&freemem_lock);
+ availrmem--;
+ mutex_exit(&freemem_lock);
+
+ page_unlock(pp);
+}
+
+/*
+ * Check whether the number of pages which have been retired already exceeds
+ * the maximum allowable percentage of memory which may be retired.
+ *
+ * Returns 1 if the limit has been exceeded.
+ */
+static int
+page_retire_limit(void)
+{
+ if (PR_KSTAT_RETIRED_NOTUE >= (uint64_t)PAGE_RETIRE_LIMIT) {
+ PR_INCR_KSTAT(pr_limit_exceeded);
+ return (1);
+ }
+
+ return (0);
+}
+
+#define MSG_DM "Data Mismatch occurred at PA 0x%08x.%08x" \
+ "[ 0x%x != 0x%x ] while attempting to clear previously " \
+ "reported error; page removed from service"
+
+#define MSG_UE "Uncorrectable Error occurred at PA 0x%08x.%08x while " \
+ "attempting to clear previously reported error; page removed " \
+ "from service"
+
+/*
+ * Attempt to clear a UE from a page.
+ * Returns 1 if the error has been successfully cleared.
+ */
+static int
+page_clear_transient_ue(page_t *pp)
+{
+ caddr_t kaddr;
+ uint8_t rb, wb;
+ uint64_t pa;
+ uint32_t pa_hi, pa_lo;
+ on_trap_data_t otd;
+ int errors = 0;
+ int i;
+
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(PP_PR_REQ(pp));
+ ASSERT(pp->p_szc == 0);
+ ASSERT(!hat_page_is_mapped(pp));
+
+ /*
+ * Clear the page and attempt to clear the UE. If we trap
+ * on the next access to the page, we know the UE has recurred.
+ */
+ pagescrub(pp, 0, PAGESIZE);
+
+ /*
+ * Map the page and write a bunch of bit patterns to compare
+ * what we wrote with what we read back. This isn't a perfect
+ * test but it should be good enough to catch most of the
+ * recurring UEs. If this fails to catch a recurrent UE, we'll
+ * retire the page the next time we see a UE on the page.
+ */
+ kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1);
+
+ pa = ptob((uint64_t)page_pptonum(pp));
+ pa_hi = (uint32_t)(pa >> 32);
+ pa_lo = (uint32_t)pa;
+
+ /*
+ * Fill the page with each bit pattern in (0x00, 0xFF], flushing
+ * the cache between writing and reading back. We do this under
+ * on_trap() protection to avoid recursion.
+ */
+ if (on_trap(&otd, OT_DATA_EC)) {
+ PR_MESSAGE(CE_WARN, 1, MSG_UE, pa);
+ errors = 1;
+ } else {
+ for (wb = 0xff; wb > 0; wb--) {
+ for (i = 0; i < PAGESIZE; i++) {
+ kaddr[i] = wb;
+ }
+
+ sync_data_memory(kaddr, PAGESIZE);
+
+ for (i = 0; i < PAGESIZE; i++) {
+ rb = kaddr[i];
+ if (rb != wb) {
+ /*
+ * We had a mismatch without a trap.
+ * Uh-oh. Something is really wrong
+ * with this system.
+ */
+ if (page_retire_messages) {
+ cmn_err(CE_WARN, MSG_DM,
+ pa_hi, pa_lo, rb, wb);
+ }
+ errors = 1;
+ goto out; /* double break */
+ }
+ }
+ }
+ }
+out:
+ no_trap();
+ ppmapout(kaddr);
+
+ return (errors ? 0 : 1);
+}
+
+/*
+ * Try to clear a page_t with a single UE. If the UE was transient, the
+ * page is returned to service, and we return 1. Otherwise we return 0,
+ * meaning that further processing is required to retire the page.
+ */
+static int
+page_retire_transient_ue(page_t *pp)
+{
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(!hat_page_is_mapped(pp));
+
+ /*
+ * If this page is a repeat offender, retire him under the
+ * "two strikes and you're out" rule. The caller is responsible
+ * for scrubbing the page to try to clear the error.
+ */
+ if (pp->p_toxic & PR_UE_SCRUBBED) {
+ PR_INCR_KSTAT(pr_ue_persistent);
+ return (0);
+ }
+
+ if (page_clear_transient_ue(pp)) {
+ /*
+ * We set the PR_UE_SCRUBBED bit; if we ever see this
+ * page again, we will retire it, no questions asked.
+ */
+ page_settoxic(pp, PR_UE_SCRUBBED);
+
+ if (page_retire_first_ue) {
+ PR_INCR_KSTAT(pr_ue_cleared_retire);
+ return (0);
+ } else {
+ PR_INCR_KSTAT(pr_ue_cleared_free);
+
+ page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG | PR_BUSY);
+ page_retire_dequeue(pp);
+
+ /*
+ * Clear the free bit if it's set, since the
+ * page free code will get cranky if we don't.
+ */
+ PP_CLRFREE(pp);
+
+ /* LINTED: CONSTCOND */
+ VN_DISPOSE(pp, B_FREE, 1, kcred);
+ return (1);
+ }
+ }
+
+ PR_INCR_KSTAT(pr_ue_persistent);
+ return (0);
+}
+
+/*
+ * Update the statistics dynamically when our kstat is read.
+ */
+static int
+page_retire_kstat_update(kstat_t *ksp, int rw)
+{
+ struct page_retire_kstat *pr;
+
+ if (ksp == NULL)
+ return (EINVAL);
+
+ switch (rw) {
+
+ case KSTAT_READ:
+ pr = (struct page_retire_kstat *)ksp->ks_data;
+ ASSERT(pr == &page_retire_kstat);
+ pr->pr_limit.value.ui64 = PAGE_RETIRE_LIMIT;
+ return (0);
+
+ case KSTAT_WRITE:
+ return (EACCES);
+
+ default:
+ return (EINVAL);
+ }
+ /*NOTREACHED*/
+}
+
+/*
+ * Initialize the page retire mechanism:
+ *
+ * - Establish the correctable error retire limit.
+ * - Initialize locks.
+ * - Build the retired_pages vnode.
+ * - Set up the kstats.
+ * - Fire off the background thread.
+ * - Tell page_tryretire() it's OK to start retiring pages.
+ */
+void
+page_retire_init(void)
+{
+ const fs_operation_def_t retired_vnodeops_template[] = {NULL, NULL};
+ struct vnodeops *vops;
+
+ const uint_t page_retire_ndata =
+ sizeof (page_retire_kstat) / sizeof (kstat_named_t);
+
+ ASSERT(page_retire_ksp == NULL);
+
+ if (max_pages_retired_bps <= 0) {
+ max_pages_retired_bps = MCE_BPT;
+ }
+
+ mutex_init(&pr_q_mutex, NULL, MUTEX_DEFAULT, NULL);
+
+ retired_pages = vn_alloc(KM_SLEEP);
+ if (vn_make_ops("retired_pages", retired_vnodeops_template, &vops)) {
+ cmn_err(CE_PANIC,
+ "page_retired_init: can't make retired vnodeops");
+ }
+ vn_setops(retired_pages, vops);
+
+ if ((page_retire_ksp = kstat_create("unix", 0, "page_retire",
+ "misc", KSTAT_TYPE_NAMED, page_retire_ndata,
+ KSTAT_FLAG_VIRTUAL)) == NULL) {
+ cmn_err(CE_WARN, "kstat_create for page_retire failed");
+ } else {
+ page_retire_ksp->ks_data = (void *)&page_retire_kstat;
+ page_retire_ksp->ks_update = page_retire_kstat_update;
+ kstat_install(page_retire_ksp);
+ }
+
+ pr_thread_shortwait = 23 * hz;
+ pr_thread_longwait = 1201 * hz;
+ mutex_init(&pr_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&pr_cv, NULL, CV_DEFAULT, NULL);
+ pr_thread_id = thread_create(NULL, 0, page_retire_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+
+ pr_enable = 1;
+}
+
+/*
+ * page_retire_hunt() callback for the retire thread.
+ */
+static void
+page_retire_thread_cb(page_t *pp)
+{
+ PR_DEBUG(prd_tctop);
+ if (pp->p_vnode != &kvp && page_trylock(pp, SE_EXCL)) {
+ PR_DEBUG(prd_tclocked);
+ page_unlock(pp);
+ }
+}
+
+/*
+ * page_retire_hunt() callback for mdboot().
+ *
+ * It is necessary to scrub any failing pages prior to reboot in order to
+ * prevent a latent error trap from occurring on the next boot.
+ */
+void
+page_retire_mdboot_cb(page_t *pp)
+{
+ /*
+ * Don't scrub the kernel, since we might still need it, unless
+ * we have UEs on the page, in which case we have nothing to lose.
+ */
+ if (pp->p_vnode != &kvp || PP_TOXIC(pp)) {
+ pp->p_selock = -1; /* pacify ASSERTs */
+ pagescrub(pp, 0, PAGESIZE);
+ pp->p_selock = 0;
+ }
+ pp->p_toxic = 0;
+}
+
+/*
+ * Hunt down any pages in the system that have not yet been retired, invoking
+ * the provided callback function on each of them.
+ */
+void
+page_retire_hunt(void (*callback)(page_t *))
+{
+ page_t *pp;
+ page_t *first;
+ int i, found;
+
+ PR_DEBUG(prd_hunt);
+
+ if (PR_KSTAT_PENDING == 0) {
+ return;
+ }
+
+ PR_DEBUG(prd_dohunt);
+
+ found = 0;
+ mutex_enter(&pr_q_mutex);
+
+ for (i = 0; i < PR_PENDING_QMAX; i++) {
+ if ((pp = pr_pending_q[i]) != NULL) {
+ mutex_exit(&pr_q_mutex);
+ callback(pp);
+ mutex_enter(&pr_q_mutex);
+ found++;
+ }
+ }
+
+ if (PR_KSTAT_EQFAIL == PR_KSTAT_DQFAIL && found == PR_KSTAT_PENDING) {
+ mutex_exit(&pr_q_mutex);
+ PR_DEBUG(prd_earlyhunt);
+ return;
+ }
+ mutex_exit(&pr_q_mutex);
+
+ PR_DEBUG(prd_latehunt);
+
+ /*
+ * We've lost track of a page somewhere. Hunt it down.
+ */
+ memsegs_lock(0);
+ pp = first = page_first();
+ do {
+ if (PP_PR_REQ(pp)) {
+ callback(pp);
+ if (++found == PR_KSTAT_PENDING) {
+ break; /* got 'em all */
+ }
+ }
+ } while ((pp = page_next(pp)) != first);
+ memsegs_unlock(0);
+}
+
+/*
+ * The page_retire_thread loops forever, looking to see if there are
+ * pages still waiting to be retired.
+ */
+static void
+page_retire_thread(void)
+{
+ callb_cpr_t c;
+
+ CALLB_CPR_INIT(&c, &pr_thread_mutex, callb_generic_cpr, "page_retire");
+
+ mutex_enter(&pr_thread_mutex);
+ for (;;) {
+ if (pr_enable && PR_KSTAT_PENDING) {
+ kmem_reap();
+ seg_preap();
+ page_retire_hunt(page_retire_thread_cb);
+ CALLB_CPR_SAFE_BEGIN(&c);
+ (void) cv_timedwait(&pr_cv, &pr_thread_mutex,
+ lbolt + pr_thread_shortwait);
+ CALLB_CPR_SAFE_END(&c, &pr_thread_mutex);
+ } else {
+ CALLB_CPR_SAFE_BEGIN(&c);
+ (void) cv_timedwait(&pr_cv, &pr_thread_mutex,
+ lbolt + pr_thread_longwait);
+ CALLB_CPR_SAFE_END(&c, &pr_thread_mutex);
+ }
+ }
+ /*NOTREACHED*/
+}
+
+/*
+ * page_retire_pp() decides what to do with a failing page.
+ *
+ * When we get a free page (e.g. the scrubber or in the free path) life is
+ * nice because the page is clean and marked free -- those always retire
+ * nicely. From there we go by order of difficulty. If the page has data,
+ * we attempt to relocate its contents to a suitable replacement page. If
+ * that does not succeed, we look to see if it is clean. If after all of
+ * this we have a clean, unmapped page (which we usually do!), we retire it.
+ * If the page is not clean, we still retire it if the error was a UE; for
+ * CEs or FMA requests, we fail and leave the page in service. The page
+ * will be tried again later. We always return with the page unlocked
+ * since we are called from page_unlock().
+ *
+ * We don't call panic or do anything fancy down in here. Our boss the DE
+ * gets paid handsomely to do his job of figuring out what to do when errors
+ * occur. We just do what he tells us to do.
+ */
+static int
+page_retire_pp(page_t *pp)
+{
+ int toxic;
+
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(pp->p_iolock_state == 0);
+ ASSERT(pp->p_szc == 0);
+
+ PR_DEBUG(prd_top);
+ PR_TYPES(pp);
+
+ toxic = pp->p_toxic;
+ ASSERT(toxic & PR_REASONS);
+
+ if ((toxic & (PR_FMA | PR_MCE)) && !(toxic & PR_UE) &&
+ page_retire_limit()) {
+ page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY);
+ page_retire_dequeue(pp);
+ page_unlock(pp);
+ return (page_retire_done(pp, PRD_LIMIT));
+ }
+
+ if (PP_ISFREE(pp)) {
+ PR_DEBUG(prd_free);
+ if (!MTBF(recl_calls, recl_mtbf) || !page_reclaim(pp, NULL)) {
+ PR_DEBUG(prd_noreclaim);
+ PR_INCR_KSTAT(pr_failed);
+ page_unlock(pp);
+ return (page_retire_done(pp, PRD_FAILED));
+ }
+ }
+
+ if ((toxic & PR_UE) == 0 && pp->p_vnode && !PP_ISFREE(pp) &&
+ !PP_ISNORELOC(pp) && MTBF(reloc_calls, reloc_mtbf)) {
+ page_t *newpp;
+ spgcnt_t count;
+
+ /*
+ * If we can relocate the page, great! newpp will go
+ * on without us, and everything is fine. Regardless
+ * of whether the relocation succeeds, we are still
+ * going to take `pp' around back and shoot it.
+ */
+ PR_DEBUG(prd_reloc);
+ newpp = NULL;
+ if (page_relocate(&pp, &newpp, 0, 0, &count, NULL) == 0) {
+ page_unlock(newpp);
+ ASSERT(hat_page_getattr(pp, P_MOD) == 0);
+ }
+ }
+
+ if (pp->p_vnode == &kvp) {
+ PR_DEBUG(prd_kern);
+ PR_INCR_KSTAT(pr_failed_kernel);
+ page_unlock(pp);
+ return (page_retire_done(pp, PRD_FAILED));
+ }
+
+ if (pp->p_lckcnt || pp->p_cowcnt) {
+ if (toxic & PR_UE) {
+ (void) page_clear_lck_cow(pp, 1);
+ } else {
+ PR_DEBUG(prd_locked);
+ PR_INCR_KSTAT(pr_failed);
+ page_unlock(pp);
+ return (page_retire_done(pp, PRD_FAILED));
+ }
+ }
+
+ (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
+ ASSERT(!PP_ISFREE(pp));
+ ASSERT(!hat_page_is_mapped(pp));
+
+ /*
+ * If the page is modified, was not relocated, and not toxic,
+ * we can't retire it without dropping data on the floor.
+ *
+ * RFE: we could change fsflush so that it (and only it) will
+ * be allowed to lock this page and push it out. Once it cleans
+ * the page, we'd then be able to retire it on the free path.
+ * In practice, this should be exceedingly rare.
+ */
+ if (hat_ismod(pp)) {
+ if ((toxic & PR_UE) == 0) {
+ PR_DEBUG(prd_modce);
+ PR_INCR_KSTAT(pr_failed);
+ page_unlock(pp);
+ return (page_retire_done(pp, PRD_FAILED));
+ } else if (page_retire_modified == 0) {
+ PR_DEBUG(prd_modue_fail);
+ PR_INCR_KSTAT(pr_failed);
+ page_unlock(pp);
+ return (page_retire_done(pp, PRD_FAILED));
+ }
+ PR_DEBUG(prd_modue_retire);
+ }
+
+ if (pp->p_vnode) {
+ PR_DEBUG(prd_hashout);
+ page_hashout(pp, NULL);
+ }
+ ASSERT(!pp->p_vnode);
+
+ /*
+ * The problem page is locked, demoted, unmapped, not free,
+ * hashed out, and not COW or mlocked (whew!).
+ *
+ * Now we select our ammunition, take it around back, and shoot it.
+ */
+ if (toxic & PR_UE) {
+ if (hat_ismod(pp)) {
+ /*
+ * Let the user know we are dropping their data
+ * on the floor.
+ */
+ PR_MESSAGE(CE_WARN, 1, "Removing modified page "
+ "0x%08x.%08x from service",
+ mmu_ptob(pp->p_pagenum));
+ }
+ if (page_retire_transient_ue(pp)) {
+ PR_DEBUG(prd_uescrubbed);
+ return (page_retire_done(pp, PRD_UE_SCRUBBED));
+ } else {
+ PR_DEBUG(prd_uenotscrubbed);
+ page_retire_destroy(pp);
+ return (page_retire_done(pp, PRD_SUCCESS));
+ }
+ } else if (toxic & PR_FMA) {
+ PR_DEBUG(prd_fma);
+ page_retire_destroy(pp);
+ return (page_retire_done(pp, PRD_SUCCESS));
+ } else if (toxic & PR_MCE) {
+ PR_DEBUG(prd_mce);
+ page_retire_destroy(pp);
+ return (page_retire_done(pp, PRD_SUCCESS));
+ }
+ panic("page_retire_pp: bad toxic flags %d", toxic);
+ /*NOTREACHED*/
+}
+
+/*
+ * Try to retire a page when we stumble onto it in the page lock routines.
+ */
+void
+page_tryretire(page_t *pp)
+{
+ ASSERT(PAGE_EXCL(pp));
+
+ if (!pr_enable) {
+ page_unlock(pp);
+ return;
+ }
+
+ /*
+ * If the page is a big page, try to break it up.
+ *
+ * If there are other bad pages besides `pp', they will be
+ * recursively retired for us thanks to a bit of magic.
+ * If the page is a small page with errors, try to retire it.
+ */
+ if (pp->p_szc > 0) {
+ if (PP_ISFREE(pp) && !page_try_demote_free_pages(pp)) {
+ page_unlock(pp);
+ PR_DEBUG(prd_nofreedemote);
+ return;
+ } else if (!page_try_demote_pages(pp)) {
+ page_unlock(pp);
+ PR_DEBUG(prd_nodemote);
+ return;
+ }
+ PR_DEBUG(prd_demoted);
+ page_unlock(pp);
+ } else {
+ (void) page_retire_pp(pp);
+ }
+}
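
For context, a sketch of the caller's side of this interface, assuming the
page_lock.c half of this change behaves as the comments here describe; the
exact guard conditions live in page_lock.c and are not reproduced. On a
final unlock of a marked page, page_unlock() keeps its exclusive hold and
hands the page over instead of releasing it:

	/* inside page_unlock(), before dropping a final SE_EXCL hold */
	if (pr_enable && PP_PR_REQ(pp)) {
		page_tryretire(pp);	/* consumes the lock */
		return;
	}
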
+
+/*
+ * page_retire() - the front door in to retire a page.
+ *
+ * Ideally, page_retire() would instantly retire the requested page.
+ * Unfortunately, some pages are locked or otherwise tied up and cannot be
+ * retired right away. To deal with that, bits are set in p_toxic of the
+ * page_t. An attempt is made to lock the page; if the attempt is successful,
+ * we instantly unlock the page counting on page_unlock() to notice p_toxic
+ * is nonzero and to call back into page_retire_pp(). Success is determined
+ * by looking to see whether the page has been retired once it has been
+ * unlocked.
+ *
+ * Returns:
+ *
+ * - 0 on success,
+ * - EINVAL when the PA is whacko,
+ * - EBUSY if the page is already retired, or
+ * - EAGAIN if the page could not be _immediately_ retired.
+ */
+int
+page_retire(uint64_t pa, uchar_t reason)
+{
+ page_t *pp;
+
+ ASSERT(reason & PR_REASONS); /* there must be a reason */
+ ASSERT(!(reason & ~PR_REASONS)); /* but no other bits */
+
+ pp = page_numtopp_nolock(mmu_btop(pa));
+ if (pp == NULL) {
+ PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
+ " page 0x%08x.%08x; page is not relocatable memory", pa);
+ return (page_retire_done(pp, PRD_INVALID_PA));
+ }
+ if (PP_RETIRED(pp)) {
+ return (page_retire_done(pp, PRD_DUPLICATE));
+ }
+
+ if (reason & PR_UE) {
+ PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
+ " page 0x%08x.%08x", pa);
+ } else {
+ PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
+ " page 0x%08x.%08x", pa);
+ }
+ page_settoxic(pp, reason);
+ page_retire_enqueue(pp);
+
+ /*
+ * And now for some magic.
+ *
+ * We marked this page toxic up above. All there is left to do is
+ * to try to lock the page and then unlock it. The page lock routines
+ * will intercept the page and retire it if they can. If the page
+ * cannot be locked, that's okay -- page_unlock() or the background
+ * thread will eventually get to it; until then, the lock routines
+ * will deny further locks on it.
+ */
+ if (MTBF(pr_calls, pr_mtbf) && page_trylock(pp, SE_EXCL)) {
+ PR_DEBUG(prd_prlocked);
+ page_unlock(pp);
+ } else {
+ PR_DEBUG(prd_prnotlocked);
+ }
+
+ if (PP_RETIRED(pp)) {
+ PR_DEBUG(prd_prretired);
+ return (0);
+ } else {
+ cv_signal(&pr_cv);
+ PR_INCR_KSTAT(pr_failed);
+
+ if (pp->p_toxic & PR_MSG) {
+ return (page_retire_done(pp, PRD_FAILED));
+ } else {
+ return (page_retire_done(pp, PRD_PENDING));
+ }
+ }
+}
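
A hypothetical caller sketch (the function name is invented; the error
codes are the ones documented above): a platform error handler retiring
the page behind a faulting physical address might do the following.

	static void
	example_retire_ue_page(uint64_t pa)
	{
		int err;

		err = page_retire(pa, PR_UE);
		if (err == EAGAIN) {
			/* queued; page_unlock() or the thread finishes it */
			return;
		}
		if (err != 0 && err != EBUSY) {
			cmn_err(CE_WARN, "could not retire page 0x%llx: %d",
			    (u_longlong_t)pa, err);
		}
	}
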
+
+/*
+ * Take a retired page off the retired-pages vnode and clear the toxic flags.
+ * If "free" is nonzero, lock it and put it back on the freelist. If "free"
+ * is zero, the caller already holds SE_EXCL lock so we simply unretire it
+ * and don't do anything else with it.
+ *
+ * Any unretire messages are printed from this routine.
+ *
+ * Returns 0 if page pp was unretired; else an error code.
+ */
+int
+page_unretire_pp(page_t *pp, int free)
+{
+ /*
+ * To be retired, a page has to be hashed onto the retired_pages vnode
+ * and have PR_RETIRED set in p_toxic.
+ */
+ if (free == 0 || page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
+ ASSERT(PAGE_EXCL(pp));
+ PR_DEBUG(prd_ulocked);
+ if (!PP_RETIRED(pp)) {
+ PR_DEBUG(prd_unotretired);
+ page_unlock(pp);
+ return (page_retire_done(pp, PRD_UNR_NOT));
+ }
+
+ PR_MESSAGE(CE_NOTE, 1, "unretiring retired"
+ " page 0x%08x.%08x", mmu_ptob(pp->p_pagenum));
+ if (pp->p_toxic & PR_FMA) {
+ PR_DECR_KSTAT(pr_fma);
+ } else if (pp->p_toxic & PR_UE) {
+ PR_DECR_KSTAT(pr_ue);
+ } else {
+ PR_DECR_KSTAT(pr_mce);
+ }
+ page_clrtoxic(pp, PR_ALLFLAGS);
+
+ if (free) {
+ PR_DEBUG(prd_udestroy);
+ page_destroy(pp, 0);
+ } else {
+ PR_DEBUG(prd_uhashout);
+ page_hashout(pp, NULL);
+ }
+
+ mutex_enter(&freemem_lock);
+ availrmem++;
+ mutex_exit(&freemem_lock);
+
+ PR_DEBUG(prd_uunretired);
+ PR_DECR_KSTAT(pr_retired);
+ PR_INCR_KSTAT(pr_unretired);
+ return (page_retire_done(pp, PRD_UNR_SUCCESS));
+ }
+ PR_DEBUG(prd_unotlocked);
+ return (page_retire_done(pp, PRD_UNR_CANTLOCK));
+}
+
+/*
+ * Return a page to service by moving it from the retired_pages vnode
+ * onto the freelist.
+ *
+ * Called from mmioctl_page_retire() on behalf of the FMA DE.
+ *
+ * Returns:
+ *
+ * - 0 if the page is unretired,
+ * - EAGAIN if the pp cannot be locked,
+ * - EINVAL if the PA is whacko, and
+ * - EBADF if the pp is not retired.
+ */
+int
+page_unretire(uint64_t pa)
+{
+ page_t *pp;
+
+ pp = page_numtopp_nolock(mmu_btop(pa));
+ if (pp == NULL) {
+ return (page_retire_done(pp, PRD_INVALID_PA));
+ }
+
+ return (page_unretire_pp(pp, 1));
+}
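
A hypothetical caller sketch: since EAGAIN only means the page was
transiently locked, an unretire can simply be retried. The loop bound and
delay below are arbitrary, and example_unretire_retry is an invented name.

	static int
	example_unretire_retry(uint64_t pa)
	{
		int i, err;

		for (i = 0; i < 10; i++) {
			if ((err = page_unretire(pa)) != EAGAIN)
				return (err);
			delay(hz);	/* wait one second between tries */
		}
		return (EAGAIN);
	}
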
+
+/*
+ * Test a page to see if it is retired. If errors is non-NULL, the toxic
+ * bits of the page are returned. Returns 0 on success, error code on failure.
+ */
+int
+page_retire_check_pp(page_t *pp, uint64_t *errors)
+{
+ int rc;
+
+ if (PP_RETIRED(pp)) {
+ PR_DEBUG(prd_checkhit);
+ rc = 0;
+ } else {
+ PR_DEBUG(prd_checkmiss);
+ rc = EAGAIN;
+ }
+
+ /*
+ * We have magically arranged the bit values returned to fmd(1M)
+ * to line up with the FMA, MCE, and UE bits of the page_t.
+ */
+ if (errors) {
+ uint64_t toxic = (uint64_t)(pp->p_toxic & PR_ERRMASK);
+ if (toxic & PR_UE_SCRUBBED) {
+ toxic &= ~PR_UE_SCRUBBED;
+ toxic |= PR_UE;
+ }
+ *errors = toxic;
+ }
+
+ return (rc);
+}
+
+/*
+ * Test to see if the page_t for a given PA is retired, and return the
+ * hardware errors we have seen on the page if requested.
+ *
+ * Called from mmioctl_page_retire on behalf of the FMA DE.
+ *
+ * Returns:
+ *
+ * - 0 if the page is retired,
+ * - EAGAIN if it is not, and
+ * - EINVAL if the PA is whacko.
+ */
+int
+page_retire_check(uint64_t pa, uint64_t *errors)
+{
+ page_t *pp;
+
+ if (errors) {
+ *errors = 0;
+ }
+
+ pp = page_numtopp_nolock(mmu_btop(pa));
+ if (pp == NULL) {
+ return (page_retire_done(pp, PRD_INVALID_PA));
+ }
+
+ return (page_retire_check_pp(pp, errors));
+}
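
A hypothetical caller sketch decoding the result (example_report_page is
an invented name; PR_UE, PR_MCE and PR_FMA are the reason bits described
above):

	static void
	example_report_page(uint64_t pa)
	{
		uint64_t errs;
		int retired;

		retired = (page_retire_check(pa, &errs) == 0);
		cmn_err(CE_CONT, "page 0x%llx: %s (ue=%d mce=%d fma=%d)\n",
		    (u_longlong_t)pa, retired ? "retired" : "not retired",
		    (errs & PR_UE) != 0, (errs & PR_MCE) != 0,
		    (errs & PR_FMA) != 0);
	}
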
+
+/*
+ * Page retire self-test. For now, it always returns 0.
+ */
+int
+page_retire_test(void)
+{
+ page_t *first, *pp, *cpp, *cpp2, *lpp;
+
+ /*
+ * Tests the corner case where a large page can't be retired
+ * because one of the constituent pages is locked. We mark
+ * one page to be retired and try to retire it, and mark the
+ * other page to be retired but don't try to retire it, so
+ * that page_unlock() in the failure path will recurse and try
+ * to retire THAT page. This is the worst possible situation
+ * we can get ourselves into.
+ */
+ memsegs_lock(0);
+ pp = first = page_first();
+ do {
+ if (pp->p_szc && PP_PAGEROOT(pp) == pp) {
+ cpp = pp + 1;
+ lpp = PP_ISFREE(pp)? pp : pp + 2;
+ cpp2 = pp + 3;
+ if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED))
+ continue;
+ if (!page_trylock(cpp, SE_EXCL)) {
+ page_unlock(lpp);
+ continue;
+ }
+ page_settoxic(cpp, PR_FMA | PR_BUSY);
+ page_settoxic(cpp2, PR_FMA);
+ page_tryretire(cpp); /* will fail */
+ page_unlock(lpp);
+ (void) page_retire(cpp->p_pagenum, PR_FMA);
+ (void) page_retire(cpp2->p_pagenum, PR_FMA);
+ }
+ } while ((pp = page_next(pp)) != first);
+ memsegs_unlock(0);
+
+ return (0);
+}
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
index 5b3db34db1..27b2702d28 100644
--- a/usr/src/uts/common/vm/vm_page.c
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -87,90 +87,6 @@ static pgcnt_t max_page_get; /* max page_get request size in pages */
pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */
/*
- * vnode for all pages which are retired from the VM system;
- * such as pages with Uncorrectable Errors.
- */
-struct vnode retired_ppages;
-
-static void page_retired_init(void);
-static void retired_dispose(vnode_t *vp, page_t *pp, int flag,
- int dn, cred_t *cr);
-static void retired_inactive(vnode_t *vp, cred_t *cr);
-static void page_retired(page_t *pp);
-static void retired_page_removed(page_t *pp);
-void page_unretire_pages(void);
-
-/*
- * The maximum number of pages that will be unretired in one iteration.
- * This number is totally arbitrary.
- */
-#define UNRETIRE_PAGES 256
-
-/*
- * We limit the number of pages that may be retired to
- * a percentage of the total physical memory. Note that
- * the percentage values are stored as 'basis points',
- * ie, 100 basis points is 1%.
- */
-#define MAX_PAGES_RETIRED_BPS_DEFAULT 10 /* .1% */
-
-uint64_t max_pages_retired_bps = MAX_PAGES_RETIRED_BPS_DEFAULT;
-
-static int pages_retired_limit_exceeded(void);
-
-/*
- * operations vector for vnode with retired pages. Only VOP_DISPOSE
- * and VOP_INACTIVE are intercepted.
- */
-struct vnodeops retired_vnodeops = {
- "retired_vnodeops",
- fs_nosys, /* open */
- fs_nosys, /* close */
- fs_nosys, /* read */
- fs_nosys, /* write */
- fs_nosys, /* ioctl */
- fs_nosys, /* setfl */
- fs_nosys, /* getattr */
- fs_nosys, /* setattr */
- fs_nosys, /* access */
- fs_nosys, /* lookup */
- fs_nosys, /* create */
- fs_nosys, /* remove */
- fs_nosys, /* link */
- fs_nosys, /* rename */
- fs_nosys, /* mkdir */
- fs_nosys, /* rmdir */
- fs_nosys, /* readdir */
- fs_nosys, /* symlink */
- fs_nosys, /* readlink */
- fs_nosys, /* fsync */
- retired_inactive,
- fs_nosys, /* fid */
- fs_rwlock, /* rwlock */
- fs_rwunlock, /* rwunlock */
- fs_nosys, /* seek */
- fs_nosys, /* cmp */
- fs_nosys, /* frlock */
- fs_nosys, /* space */
- fs_nosys, /* realvp */
- fs_nosys, /* getpage */
- fs_nosys, /* putpage */
- fs_nosys_map,
- fs_nosys_addmap,
- fs_nosys, /* delmap */
- fs_nosys_poll,
- fs_nosys, /* dump */
- fs_nosys, /* l_pathconf */
- fs_nosys, /* pageio */
- fs_nosys, /* dumpctl */
- retired_dispose,
- fs_nosys, /* setsecattr */
- fs_nosys, /* getsecatt */
- fs_nosys, /* shrlock */
- fs_vnevent_nosupport /* vnevent */
-};
-
-/*
* freemem_lock protects all freemem variables:
* availrmem. Also this lock protects the globals which track the
* availrmem changes for accurate kernel footprint calculation.
@@ -289,15 +205,6 @@ static kcondvar_t pcgs_cv; /* cv for delay in pcgs */
#define PAGE_LOCK_MAXIMUM \
((1 << (sizeof (((page_t *)0)->p_lckcnt) * NBBY)) - 1)
-/*
- * Control over the verbosity of page retirement. When set to zero, no messages
- * will be printed. A value of one will trigger messages for retirement
- * operations, and is intended for processors which don't yet support FMA
- * (spitfire). Two will cause verbose messages to be printed when retirements
- * complete, and is intended only for debugging purposes.
- */
-int page_retire_messages = 0;
-
#ifdef VM_STATS
/*
@@ -440,11 +347,7 @@ vm_init(void)
(void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
page_init_mem_config();
-
- /*
- * initialise the vnode for retired pages
- */
- page_retired_init();
+ page_retire_init();
}
/*
@@ -2799,153 +2702,6 @@ page_free(page_t *pp, int dontneed)
ASSERT((PAGE_EXCL(pp) &&
!page_iolock_assert(pp)) || panicstr);
- if (page_deteriorating(pp)) {
- volatile int i = 0;
- char *kaddr;
- volatile int rb, wb;
- uint64_t pa;
- volatile int ue = 0;
- on_trap_data_t otd;
-
- if (pp->p_vnode != NULL) {
- /*
- * Let page_destroy() do its bean counting and
- * hash out the page; it will then call back
- * into page_free() with pp->p_vnode == NULL.
- */
- page_destroy(pp, 0);
- return;
- }
-
- if (page_isfailing(pp)) {
- /*
- * If we have already exceeded the limit for
- * pages retired, we will treat this page as
- * 'toxic' rather than failing. That will ensure
- * that the page is at least cleaned, and if
- * a UE is detected, the page will be retired
- * anyway.
- */
- if (pages_retired_limit_exceeded()) {
- /*
- * clear the flag and reset to toxic
- */
- page_clrtoxic(pp);
- page_settoxic(pp, PAGE_IS_TOXIC);
- } else {
- pa = ptob((uint64_t)page_pptonum(pp));
- if (page_retire_messages) {
- cmn_err(CE_NOTE, "Page 0x%08x.%08x "
- "removed from service",
- (uint32_t)(pa >> 32), (uint32_t)pa);
- }
- goto page_failed;
- }
- }
-
- pagescrub(pp, 0, PAGESIZE);
-
- /*
- * We want to determine whether the error that occurred on
- * this page is transient or persistent, so we get a mapping
- * to the page and try every possible bit pattern to compare
- * what we write with what we read back. A smaller number
- * of bit patterns might suffice, but there's no point in
- * getting fancy. If this is the hot path on your system,
- * you've got bigger problems.
- */
- kaddr = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
- for (wb = 0xff; wb >= 0; wb--) {
- if (on_trap(&otd, OT_DATA_EC)) {
- pa = ptob((uint64_t)page_pptonum(pp)) + i;
- page_settoxic(pp, PAGE_IS_FAILING);
-
- if (page_retire_messages) {
- cmn_err(CE_WARN, "Uncorrectable Error "
- "occurred at PA 0x%08x.%08x while "
- "attempting to clear previously "
- "reported error; page removed from "
- "service", (uint32_t)(pa >> 32),
- (uint32_t)pa);
- }
-
- ue++;
- break;
- }
-
- /*
- * Write out the bit pattern, flush it to memory, and
- * read it back while under on_trap() protection.
- */
- for (i = 0; i < PAGESIZE; i++)
- kaddr[i] = wb;
-
- sync_data_memory(kaddr, PAGESIZE);
-
- for (i = 0; i < PAGESIZE; i++) {
- if ((rb = (uchar_t)kaddr[i]) != wb) {
- page_settoxic(pp, PAGE_IS_FAILING);
- goto out;
- }
- }
- }
-out:
- no_trap();
- ppmapout(kaddr);
-
- if (wb >= 0 && !ue) {
- pa = ptob((uint64_t)page_pptonum(pp)) + i;
- if (page_retire_messages) {
- cmn_err(CE_WARN, "Data Mismatch occurred at PA "
- "0x%08x.%08x [ 0x%x != 0x%x ] while "
- "attempting to clear previously reported "
- "error; page removed from service",
- (uint32_t)(pa >> 32), (uint32_t)pa, rb, wb);
- }
- }
-page_failed:
- /*
- * DR operations change the association between a page_t
- * and the physical page it represents. Check if the
- * page is still bad. If it is, then retire it.
- */
- if (page_isfaulty(pp) && page_isfailing(pp)) {
- /*
- * In the future, it might be useful to have a platform
- * callback here to tell the hardware to fence off this
- * page during the next reboot.
- *
- * We move the page to the retired_vnode here
- */
- (void) page_hashin(pp, &retired_ppages,
- (u_offset_t)ptob((uint64_t)page_pptonum(pp)), NULL);
- mutex_enter(&freemem_lock);
- availrmem--;
- mutex_exit(&freemem_lock);
- page_retired(pp);
- page_downgrade(pp);
-
- /*
- * If DR raced with the above page retirement code,
- * we might have retired a good page. If so, unretire
- * the page.
- */
- if (!page_isfaulty(pp))
- page_unretire_pages();
- return;
- }
-
- pa = ptob((uint64_t)page_pptonum(pp));
-
- if (page_retire_messages) {
- cmn_err(CE_NOTE, "Previously reported error on page "
- "0x%08x.%08x cleared", (uint32_t)(pa >> 32),
- (uint32_t)pa);
- }
-
- page_clrtoxic(pp);
- }
-
if (PP_ISFREE(pp)) {
panic("page_free: page %p is free", (void *)pp);
}
@@ -3089,7 +2845,6 @@ page_free_pages(page_t *pp)
pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
pgcnt_t i;
uint_t szc = pp->p_szc;
- int toxic = 0;
VM_STAT_ADD(pagecnt.pc_free_pages);
TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
@@ -3118,9 +2873,6 @@ page_free_pages(page_t *pp)
ASSERT(tpp->p_vnode == NULL);
ASSERT(tpp->p_szc == szc);
- if (page_deteriorating(tpp))
- toxic = 1;
-
PP_SETFREE(tpp);
page_clr_all_props(tpp);
PP_SETAGED(tpp);
@@ -3131,10 +2883,6 @@ page_free_pages(page_t *pp)
}
ASSERT(rootpp == pp);
- if (toxic) {
- page_free_toxic_pages(rootpp);
- return;
- }
page_list_add_pages(rootpp, 0);
page_create_putback(pgcnt);
}
@@ -3219,12 +2967,13 @@ page_reclaim(page_t *pp, kmutex_t *lock)
struct pcf *p;
uint_t pcf_index;
struct cpu *cpup;
- int enough;
uint_t i;
+ pgcnt_t npgs, need, collected;
ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
- ASSERT(pp->p_szc == 0);
+
+ npgs = page_get_pagecnt(pp->p_szc);
/*
* If `freemem' is 0, we cannot reclaim this page from the
@@ -3254,18 +3003,19 @@ page_reclaim(page_t *pp, kmutex_t *lock)
goto page_reclaim_nomem;
}
- enough = 0;
+ collected = 0;
pcf_index = PCF_INDEX();
p = &pcf[pcf_index];
p->pcf_touch = 1;
mutex_enter(&p->pcf_lock);
- if (p->pcf_count >= 1) {
- enough = 1;
- p->pcf_count--;
+ if (p->pcf_count >= npgs) {
+ collected = npgs;
+ p->pcf_count -= npgs;
}
mutex_exit(&p->pcf_lock);
+ need = npgs - collected;
- if (!enough) {
+ if (need > 0) {
VM_STAT_ADD(page_reclaim_zero);
/*
* Check again. Its possible that some other thread
@@ -3277,15 +3027,22 @@ page_reclaim(page_t *pp, kmutex_t *lock)
for (i = 0; i < PCF_FANOUT; i++) {
p->pcf_touch = 1;
mutex_enter(&p->pcf_lock);
- if (p->pcf_count >= 1) {
- p->pcf_count -= 1;
- enough = 1;
- break;
+ if (p->pcf_count) {
+ if (p->pcf_count >= need) {
+ p->pcf_count -= need;
+ collected += need;
+ need = 0;
+ break;
+ } else if (p->pcf_count) {
+ collected += p->pcf_count;
+ need -= p->pcf_count;
+ p->pcf_count = 0;
+ }
}
p++;
}
- if (!enough) {
+ if (need > 0) {
page_reclaim_nomem:
/*
* We really can't have page `pp'.
@@ -3309,6 +3066,7 @@ page_reclaim_nomem:
mutex_enter(&new_freemem_lock);
p = pcf;
+ p->pcf_count += collected;
for (i = 0; i < PCF_FANOUT; i++) {
p->pcf_wait++;
mutex_exit(&p->pcf_lock);
@@ -3328,11 +3086,13 @@ page_reclaim_nomem:
}
/*
- * There was a page to be found.
+ * We beat the PCF bins over the head until
+ * we got the memory that we wanted.
* The pcf accounting has been done,
* though none of the pcf_wait flags have been set,
* drop the locks and continue on.
*/
+ ASSERT(collected == npgs);
while (p >= pcf) {
mutex_exit(&p->pcf_lock);
p--;
@@ -3343,14 +3103,19 @@ page_reclaim_nomem:
* freemem is not protected by any lock. Thus, we cannot
* have any assertion containing freemem here.
*/
- freemem -= 1;
+ freemem -= npgs;
VM_STAT_ADD(pagecnt.pc_reclaim);
if (PP_ISAGED(pp)) {
- page_list_sub(pp, PG_FREE_LIST);
+ if (npgs > 1) {
+ page_list_sub_pages(pp, pp->p_szc);
+ } else {
+ page_list_sub(pp, PG_FREE_LIST);
+ }
TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
"page_reclaim_free:pp %p", pp);
} else {
+ ASSERT(npgs == 1);
page_list_sub(pp, PG_CACHE_LIST);
TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
"page_reclaim_cache:pp %p", pp);
@@ -3363,9 +3128,11 @@ page_reclaim_nomem:
*
* Set the reference bit to protect against immediate pageout.
*/
- PP_CLRFREE(pp);
- PP_CLRAGED(pp);
- page_set_props(pp, P_REF);
+ for (i = 0; i < npgs; i++, pp = page_next(pp)) {
+ PP_CLRFREE(pp);
+ PP_CLRAGED(pp);
+ page_set_props(pp, P_REF);
+ }
CPU_STATS_ENTER_K();
cpup = CPU; /* get cpup now that CPU cannot change */
@@ -3441,7 +3208,6 @@ page_destroy_pages(page_t *pp)
pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
pgcnt_t i, pglcks = 0;
uint_t szc = pp->p_szc;
- int toxic = 0;
ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
@@ -3471,9 +3237,6 @@ page_destroy_pages(page_t *pp)
ASSERT(tpp->p_vnode == NULL);
ASSERT(tpp->p_szc == szc);
- if (page_deteriorating(tpp))
- toxic = 1;
-
PP_SETFREE(tpp);
page_clr_all_props(tpp);
PP_SETAGED(tpp);
@@ -3489,10 +3252,6 @@ page_destroy_pages(page_t *pp)
mutex_exit(&freemem_lock);
}
- if (toxic) {
- page_free_toxic_pages(rootpp);
- return;
- }
page_list_add_pages(rootpp, 0);
page_create_putback(pgcnt);
}
@@ -3916,14 +3675,6 @@ page_hashout(page_t *pp, kmutex_t *phm)
mutex_exit(nphm);
/*
- * If the page was retired, update the pages_retired
- * total and clear the page flag
- */
- if (page_isretired(pp)) {
- retired_page_removed(pp);
- }
-
- /*
* Wake up processes waiting for this page. The page's
* identity has been changed, and is probably not the
* desired page any longer.
@@ -5397,6 +5148,63 @@ page_release(page_t *pp, int checkmod)
return (status);
}
+/*
+ * Given a constituent page, try to demote the large page on the freelist.
+ *
+ * Returns nonzero if the page could be demoted successfully. Returns with
+ * the constituent page still locked.
+ */
+int
+page_try_demote_free_pages(page_t *pp)
+{
+ page_t *rootpp = pp;
+ pfn_t pfn = page_pptonum(pp);
+ spgcnt_t npgs;
+ uint_t szc = pp->p_szc;
+
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(PAGE_EXCL(pp));
+
+ /*
+ * Adjust rootpp and lock it if `pp' is not the base
+ * constituent page.
+ */
+ npgs = page_get_pagecnt(pp->p_szc);
+ if (npgs == 1) {
+ return (0);
+ }
+
+ if (!IS_P2ALIGNED(pfn, npgs)) {
+ pfn = P2ALIGN(pfn, npgs);
+ rootpp = page_numtopp_nolock(pfn);
+ }
+
+ if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
+ return (0);
+ }
+
+ if (rootpp->p_szc != szc) {
+ if (pp != rootpp)
+ page_unlock(rootpp);
+ return (0);
+ }
+
+ page_demote_free_pages(rootpp);
+
+ if (pp != rootpp)
+ page_unlock(rootpp);
+
+ ASSERT(PP_ISFREE(pp));
+ ASSERT(PAGE_EXCL(pp));
+ return (1);
+}
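
A hypothetical usage sketch (example_demote_free_lpage is an invented
name): a caller holding SE_EXCL on a constituent page of a free large
page demotes it before operating on the page one PAGESIZE unit at a time.

	static void
	example_demote_free_lpage(page_t *pp)
	{
		ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));

		if (pp->p_szc != 0 && page_try_demote_free_pages(pp)) {
			/* pp is now a PAGESIZE page, still SE_EXCL locked */
			ASSERT(pp->p_szc == 0);
		}
		page_unlock(pp);
	}
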
+
+/*
+ * Given a constituent page, try to demote the large page.
+ *
+ * Returns nonzero if the page could be demoted successfully. Returns with
+ * the constituent page still locked.
+ */
int
page_try_demote_pages(page_t *pp)
{
@@ -5406,27 +5214,27 @@ page_try_demote_pages(page_t *pp)
uint_t szc = pp->p_szc;
vnode_t *vp = pp->p_vnode;
- ASSERT(PAGE_EXCL(rootpp));
+ ASSERT(PAGE_EXCL(pp));
VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
- if (rootpp->p_szc == 0) {
+ if (pp->p_szc == 0) {
VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
return (1);
}
if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) {
VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
- page_demote_vp_pages(rootpp);
+ page_demote_vp_pages(pp);
ASSERT(pp->p_szc == 0);
return (1);
}
/*
- * Adjust rootpp if passed in is not the base
+ * Adjust rootpp if the page passed in is not the base
* constituent page.
*/
- npgs = page_get_pagecnt(rootpp->p_szc);
+ npgs = page_get_pagecnt(pp->p_szc);
ASSERT(npgs > 1);
if (!IS_P2ALIGNED(pfn, npgs)) {
pfn = P2ALIGN(pfn, npgs);
@@ -5455,12 +5263,11 @@ page_try_demote_pages(page_t *pp)
break;
ASSERT(tpp->p_szc == rootpp->p_szc);
ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
- (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
}
/*
- * If we failed to lock them all then unlock what we have locked
- * so far and bail.
+ * If we failed to lock them all then unlock what we have
+ * locked so far and bail.
*/
if (i < npgs) {
tpp = rootpp;
@@ -5473,12 +5280,9 @@ page_try_demote_pages(page_t *pp)
return (0);
}
- /*
- * XXX probably p_szc clearing and page unlocking can be done within
- * one loop but since this is rare code we can play very safe.
- */
for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
ASSERT(PAGE_EXCL(tpp));
+ (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
tpp->p_szc = 0;
}
@@ -5490,6 +5294,7 @@ page_try_demote_pages(page_t *pp)
if (tpp != pp)
page_unlock(tpp);
}
+
VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
return (1);
}
@@ -5579,221 +5384,6 @@ page_demote_vp_pages(page_t *pp)
}
/*
- * Page retire operation.
- *
- * page_retire()
- * Attempt to retire (throw away) page pp. We cannot do this if
- * the page is dirty; if the page is clean, we can try. We return 0 on
- * success, -1 on failure. This routine should be invoked by the platform's
- * memory error detection code.
- *
- * pages_retired_limit_exceeded()
- * We set a limit on the number of pages which may be retired. This
- * is set to a percentage of total physical memory. This limit is
- * enforced here.
- */
-
-static pgcnt_t retired_pgcnt = 0;
-
-/*
- * routines to update the count of retired pages
- */
-static void
-page_retired(page_t *pp)
-{
- ASSERT(pp);
-
- page_settoxic(pp, PAGE_IS_RETIRED);
- atomic_add_long(&retired_pgcnt, 1);
-}
-
-static void
-retired_page_removed(page_t *pp)
-{
- ASSERT(pp);
- ASSERT(page_isretired(pp));
- ASSERT(retired_pgcnt > 0);
-
- page_clrtoxic(pp);
- atomic_add_long(&retired_pgcnt, -1);
-}
-
-
-static int
-pages_retired_limit_exceeded()
-{
- pgcnt_t retired_max;
-
- /*
- * If the percentage is zero or is not set correctly,
- * return TRUE so that pages are not retired.
- */
- if (max_pages_retired_bps <= 0 ||
- max_pages_retired_bps >= 10000)
- return (1);
-
- /*
- * Calculate the maximum number of pages allowed to
- * be retired as a percentage of total physical memory
- * (Remember that we are using basis points, hence the 10000.)
- */
- retired_max = (physmem * max_pages_retired_bps) / 10000;
-
- /*
- * return 'TRUE' if we have already retired more
- * than the legal limit
- */
- return (retired_pgcnt >= retired_max);
-}
-
-#define PAGE_RETIRE_SELOCK 0
-#define PAGE_RETIRE_NORECLAIM 1
-#define PAGE_RETIRE_LOCKED 2
-#define PAGE_RETIRE_COW 3
-#define PAGE_RETIRE_DIRTY 4
-#define PAGE_RETIRE_LPAGE 5
-#define PAGE_RETIRE_SUCCESS 6
-#define PAGE_RETIRE_LIMIT 7
-#define PAGE_RETIRE_NCODES 8
-
-typedef struct page_retire_op {
- int pr_count;
- short pr_unlock;
- short pr_retval;
- char *pr_message;
-} page_retire_op_t;
-
-page_retire_op_t page_retire_ops[PAGE_RETIRE_NCODES] = {
- { 0, 0, -1, "cannot lock page" },
- { 0, 0, -1, "cannot reclaim cached page" },
- { 0, 1, -1, "page is locked" },
- { 0, 1, -1, "copy-on-write page" },
- { 0, 1, -1, "page is dirty" },
- { 0, 1, -1, "cannot demote large page" },
- { 0, 0, 0, "page successfully retired" },
- { 0, 0, -1, "excess pages retired already" },
-};
-
-static int
-page_retire_done(page_t *pp, int code)
-{
- page_retire_op_t *prop = &page_retire_ops[code];
-
- prop->pr_count++;
-
- if (prop->pr_unlock)
- page_unlock(pp);
-
- if (page_retire_messages > 1) {
- printf("page_retire(%p) pfn 0x%lx %s: %s\n",
- (void *)pp, page_pptonum(pp),
- prop->pr_retval == -1 ? "failed" : "succeeded",
- prop->pr_message);
- }
-
- return (prop->pr_retval);
-}
-
-int
-page_retire(page_t *pp, uchar_t flag)
-{
- uint64_t pa = ptob((uint64_t)page_pptonum(pp));
-
- ASSERT(flag == PAGE_IS_FAILING || flag == PAGE_IS_TOXIC);
-
- /*
- * DR operations change the association between a page_t
- * and the physical page it represents. Check if the
- * page is still bad.
- */
- if (!page_isfaulty(pp)) {
- page_clrtoxic(pp);
- return (page_retire_done(pp, PAGE_RETIRE_SUCCESS));
- }
-
- /*
- * We set the flag here so that even if we fail due
- * to exceeding the limit for retired pages, the
- * page will still be checked and either cleared
- * or retired in page_free().
- */
- page_settoxic(pp, flag);
-
- if (flag == PAGE_IS_TOXIC) {
- if (page_retire_messages) {
- cmn_err(CE_NOTE, "Scheduling clearing of error on"
- " page 0x%08x.%08x",
- (uint32_t)(pa >> 32), (uint32_t)pa);
- }
-
- } else { /* PAGE_IS_FAILING */
- if (pages_retired_limit_exceeded()) {
- /*
- * Return as we have already exceeded the
- * maximum number of pages allowed to be
- * retired
- */
- return (page_retire_done(pp, PAGE_RETIRE_LIMIT));
- }
-
- if (page_retire_messages) {
- cmn_err(CE_NOTE, "Scheduling removal of "
- "page 0x%08x.%08x",
- (uint32_t)(pa >> 32), (uint32_t)pa);
- }
- }
-
- if (PAGE_LOCKED(pp) || !page_trylock(pp, SE_EXCL))
- return (page_retire_done(pp, PAGE_RETIRE_SELOCK));
-
- /*
- * If this is a large page we first try and demote it
- * to PAGESIZE pages and then dispose of the toxic page.
- * On failure we will let the page free/destroy
- * code handle it later since this is a mapped page.
- * Note that free large pages can always be demoted.
- *
- */
- if (pp->p_szc != 0) {
- if (PP_ISFREE(pp))
- (void) page_demote_free_pages(pp);
- else
- (void) page_try_demote_pages(pp);
-
- if (pp->p_szc != 0)
- return (page_retire_done(pp, PAGE_RETIRE_LPAGE));
- }
-
- if (PP_ISFREE(pp)) {
- if (!page_reclaim(pp, NULL))
- return (page_retire_done(pp, PAGE_RETIRE_NORECLAIM));
- /*LINTED: constant in conditional context*/
- VN_DISPOSE(pp, pp->p_vnode ? B_INVAL : B_FREE, 0, kcred)
- return (page_retire_done(pp, PAGE_RETIRE_SUCCESS));
- }
-
- if (pp->p_lckcnt != 0)
- return (page_retire_done(pp, PAGE_RETIRE_LOCKED));
-
- if (pp->p_cowcnt != 0)
- return (page_retire_done(pp, PAGE_RETIRE_COW));
-
- /*
- * Unload all translations to this page. No new translations
- * can be created while we hold the exclusive lock on the page.
- */
- (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
-
- if (hat_ismod(pp))
- return (page_retire_done(pp, PAGE_RETIRE_DIRTY));
-
- /*LINTED: constant in conditional context*/
- VN_DISPOSE(pp, B_INVAL, 0, kcred);
-
- return (page_retire_done(pp, PAGE_RETIRE_SUCCESS));
-}
-
-/*
* Mark any existing pages for migration in the given range
*/
void
@@ -6128,140 +5718,6 @@ next:
}
}
-/*
- * initialize the vnode for retired pages
- */
-static void
-page_retired_init(void)
-{
- vn_setops(&retired_ppages, &retired_vnodeops);
-}
-
-/* ARGSUSED */
-static void
-retired_dispose(vnode_t *vp, page_t *pp, int flag, int dn, cred_t *cr)
-{
- panic("retired_dispose invoked");
-}
-
-/* ARGSUSED */
-static void
-retired_inactive(vnode_t *vp, cred_t *cr)
-{}
-
-void
-page_unretire_pages(void)
-{
- page_t *pp;
- kmutex_t *vphm;
- vnode_t *vp;
- page_t *rpages[UNRETIRE_PAGES];
- pgcnt_t i, npages, rmem;
- uint64_t pa;
-
- rmem = 0;
-
- for (;;) {
- /*
- * We do this in 2 steps:
- *
- * 1. We walk the retired pages list and collect a list of
- * pages that have the toxic field cleared.
- *
- * 2. We iterate through the page list and unretire each one.
- *
- * We have to do it in two steps on account of the mutexes that
- * we need to acquire.
- */
-
- vp = &retired_ppages;
- vphm = page_vnode_mutex(vp);
- mutex_enter(vphm);
-
- if ((pp = vp->v_pages) == NULL) {
- mutex_exit(vphm);
- break;
- }
-
- i = 0;
- do {
- ASSERT(pp != NULL);
- ASSERT(pp->p_vnode == vp);
-
- /*
- * DR operations change the association between a page_t
- * and the physical page it represents. Check if the
- * page is still bad. If not, unretire it.
- */
- if (!page_isfaulty(pp))
- rpages[i++] = pp;
-
- pp = pp->p_vpnext;
- } while ((pp != vp->v_pages) && (i < UNRETIRE_PAGES));
-
- mutex_exit(vphm);
-
- npages = i;
- for (i = 0; i < npages; i++) {
- pp = rpages[i];
- pa = ptob((uint64_t)page_pptonum(pp));
-
- /*
- * Need to upgrade the shared lock to an exclusive
- * lock in order to hash out the page.
- *
- * The page could have been retired but the page lock
- * may not have been downgraded yet. If so, skip this
- * page. page_free() will call this function after the
- * lock is downgraded.
- */
-
- if (!PAGE_SHARED(pp) || !page_tryupgrade(pp))
- continue;
-
- /*
- * Both page_free() and DR call this function. They
- * can potentially call this function at the same
- * time and race with each other.
- */
- if (!page_isretired(pp) || page_isfaulty(pp)) {
- page_downgrade(pp);
- continue;
- }
-
- cmn_err(CE_NOTE,
- "unretiring retired page 0x%08x.%08x",
- (uint32_t)(pa >> 32), (uint32_t)pa);
-
- /*
- * When a page is removed from the retired pages vnode,
- * its toxic field is also cleared. So, we do not have
- * to do that seperately here.
- */
- page_hashout(pp, (kmutex_t *)NULL);
-
- /*
- * This is a good page. So, free it.
- */
- pp->p_vnode = NULL;
- page_free(pp, 1);
- rmem++;
- }
-
- /*
- * If the rpages array was filled up, then there could be more
- * retired pages that are not faulty. We need to iterate
- * again and unretire them. Otherwise, we are done.
- */
- if (npages < UNRETIRE_PAGES)
- break;
- }
-
- mutex_enter(&freemem_lock);
- availrmem += rmem;
- mutex_exit(&freemem_lock);
-}
-
ulong_t mem_waiters = 0;
ulong_t max_count = 20;
#define MAX_DELAY 0x1ff
@@ -6621,90 +6077,48 @@ page_clr_all_props(page_t *pp)
}
/*
- * The following functions is called from free_vp_pages()
- * for an inexact estimate of a newly free'd page...
+ * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
*/
-ulong_t
-page_share_cnt(page_t *pp)
-{
- return (hat_page_getshare(pp));
-}
-
-/*
- * The following functions are used in handling memory
- * errors.
- */
-
-int
-page_istoxic(page_t *pp)
-{
- return ((pp->p_toxic & PAGE_IS_TOXIC) == PAGE_IS_TOXIC);
-}
-
-int
-page_isfailing(page_t *pp)
-{
- return ((pp->p_toxic & PAGE_IS_FAILING) == PAGE_IS_FAILING);
-}
-
-int
-page_isretired(page_t *pp)
-{
- return ((pp->p_toxic & PAGE_IS_RETIRED) == PAGE_IS_RETIRED);
-}
-
int
-page_deteriorating(page_t *pp)
+page_clear_lck_cow(page_t *pp, int adjust)
{
- return ((pp->p_toxic & (PAGE_IS_TOXIC | PAGE_IS_FAILING)) != 0);
-}
+ int f_amount;
-void
-page_settoxic(page_t *pp, uchar_t flag)
-{
- uchar_t new_flag = 0;
- while ((new_flag & flag) != flag) {
- uchar_t old_flag = pp->p_toxic;
- new_flag = old_flag | flag;
- (void) cas8(&pp->p_toxic, old_flag, new_flag);
- new_flag = ((volatile page_t *)pp)->p_toxic;
- }
-}
+ ASSERT(PAGE_EXCL(pp));
-void
-page_clrtoxic(page_t *pp)
-{
/*
- * We don't need to worry about atomicity on the
- * p_toxic flag here as this is only called from
- * page_free() while holding an exclusive lock on
- * the page
+ * The page_struct_lock need not be acquired here since
+ * we require the caller hold the page exclusively locked.
*/
- pp->p_toxic = PAGE_IS_OK;
-}
+ f_amount = 0;
+ if (pp->p_lckcnt) {
+ f_amount = 1;
+ pp->p_lckcnt = 0;
+ }
+ if (pp->p_cowcnt) {
+ f_amount += pp->p_cowcnt;
+ pp->p_cowcnt = 0;
+ }
-void
-page_clrtoxic_flag(page_t *pp, uchar_t flag)
-{
- uchar_t new_flag = ((volatile page_t *)pp)->p_toxic;
- while ((new_flag & flag) == flag) {
- uchar_t old_flag = new_flag;
- new_flag = old_flag & ~flag;
- (void) cas8(&pp->p_toxic, old_flag, new_flag);
- new_flag = ((volatile page_t *)pp)->p_toxic;
+ if (adjust && f_amount) {
+ mutex_enter(&freemem_lock);
+ availrmem += f_amount;
+ mutex_exit(&freemem_lock);
}
-}
-int
-page_isfaulty(page_t *pp)
-{
- return ((pp->p_toxic & PAGE_IS_FAULTY) == PAGE_IS_FAULTY);
+ return (f_amount);
}
/*
- * The following four functions are called from /proc code
- * for the /proc/<pid>/xmap interface.
+ * The following function is called from free_vp_pages()
+ * for an inexact estimate of a newly freed page...
*/
+ulong_t
+page_share_cnt(page_t *pp)
+{
+ return (hat_page_getshare(pp));
+}
+
int
page_isshared(page_t *pp)
{
diff --git a/usr/src/uts/common/vm/vm_pagelist.c b/usr/src/uts/common/vm/vm_pagelist.c
index 225e8d157f..994ddca8a6 100644
--- a/usr/src/uts/common/vm/vm_pagelist.c
+++ b/usr/src/uts/common/vm/vm_pagelist.c
@@ -1305,7 +1305,7 @@ page_list_add_pages(page_t *pp, int flags)
kcage_freemem_add(pgcnt);
#endif
for (i = 0; i < pgcnt; i++, pp++)
- page_unlock(pp);
+ page_unlock_noretire(pp);
}
}
@@ -1753,7 +1753,7 @@ page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
phm = PAGE_HASH_MUTEX(index);
if (!mutex_tryenter(phm)) {
- page_unlock(pp);
+ page_unlock_noretire(pp);
goto fail_promote;
}
@@ -1761,7 +1761,7 @@ page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
page_hashout(pp, phm);
mutex_exit(phm);
PP_SETAGED(pp);
- page_unlock(pp);
+ page_unlock_noretire(pp);
which_list = PG_CACHE_LIST;
}
page_ctr_sub(mnode, mtype, pp, which_list);
@@ -2209,7 +2209,7 @@ page_trylock_cons(page_t *pp, se_t se)
* have locked so far.
*/
while (first_pp != tpp) {
- page_unlock(first_pp);
+ page_unlock_noretire(first_pp);
first_pp = first_pp->p_next;
}
return (0);
@@ -2575,7 +2575,7 @@ skipptcpcheck:
while (--i != (pgcnt_t)-1) {
pp = &spp[i];
ASSERT(PAGE_EXCL(pp));
- page_unlock(pp);
+ page_unlock_noretire(pp);
}
return (0);
}
@@ -2584,7 +2584,7 @@ skipptcpcheck:
!PP_ISFREE(pp)) {
VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
ASSERT(i == 0);
- page_unlock(pp);
+ page_unlock_noretire(pp);
return (0);
}
if (PP_ISNORELOC(pp)) {
@@ -2592,7 +2592,7 @@ skipptcpcheck:
while (i != (pgcnt_t)-1) {
pp = &spp[i];
ASSERT(PAGE_EXCL(pp));
- page_unlock(pp);
+ page_unlock_noretire(pp);
i--;
}
return (0);
@@ -2687,7 +2687,7 @@ page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
*/
while (pgcnt--) {
ASSERT(PAGE_EXCL(pp));
- page_unlock(pp);
+ page_unlock_noretire(pp);
pp++;
}
/*
@@ -2702,7 +2702,7 @@ page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
ASSERT(PP_ISAGED(pp));
pp->p_szc = 0;
page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
- page_unlock(pp);
+ page_unlock_noretire(pp);
}
if (replpp != NULL)
@@ -2734,7 +2734,7 @@ page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
page_sub(&replpp, rpp);
ASSERT(PAGE_EXCL(rpp));
ASSERT(!PP_ISFREE(rpp));
- page_unlock(rpp);
+ page_unlock_noretire(rpp);
}
ASSERT(targpp == hpp);
ASSERT(replpp == NULL);
diff --git a/usr/src/uts/i86pc/os/machdep.c b/usr/src/uts/i86pc/os/machdep.c
index 1d6cd158b4..5071dae5ee 100644
--- a/usr/src/uts/i86pc/os/machdep.c
+++ b/usr/src/uts/i86pc/os/machdep.c
@@ -226,6 +226,8 @@ mdboot(int cmd, int fcn, char *mdep, boolean_t invoke_cb)
if (invoke_cb)
(void) callb_execute_class(CB_CL_MDBOOT, NULL);
+ page_retire_hunt(page_retire_mdboot_cb);
+
/*
* stop other cpus and raise our priority. since there is only
* one active cpu after this, and our priority will be too high
diff --git a/usr/src/uts/sun4u/cpu/spitfire.c b/usr/src/uts/sun4u/cpu/spitfire.c
index 35d97e6b23..75f57ce39c 100644
--- a/usr/src/uts/sun4u/cpu/spitfire.c
+++ b/usr/src/uts/sun4u/cpu/spitfire.c
@@ -432,6 +432,7 @@ void
cpu_setup(void)
{
extern int page_retire_messages;
+ extern int page_retire_first_ue;
extern int at_flags;
#if defined(SF_ERRATA_57)
extern caddr_t errata57_limit;
@@ -445,9 +446,11 @@ cpu_setup(void)
/*
* Spitfire isn't currently FMA-aware, so we have to enable the
- * page retirement messages.
+ * page retirement messages. We also change the default policy
+ * for UE retirement to allow clearing of transient errors.
*/
page_retire_messages = 1;
+ page_retire_first_ue = 0;
/*
* save the cache bootup state.
@@ -895,10 +898,7 @@ cpu_ce_error(struct regs *rp, ulong_t p_afar, ulong_t p_afsr,
curthread->t_ontrap != NULL) {
if (curthread->t_ontrap->ot_prot & OT_DATA_EC) {
- page_t *pp = page_numtopp_nolock((pfn_t)
- (ecc->flt_addr >> MMU_PAGESHIFT));
-
- if (pp != NULL && page_isretired(pp)) {
+ if (page_retire_check(ecc->flt_addr, NULL) == 0) {
queue = 0;
}
}
@@ -1093,6 +1093,7 @@ cpu_ce_log_err(struct async_flt *ecc, errorq_elem_t *eqep)
char unum[UNUM_NAMLEN];
int len = 0;
int ce_verbose = 0;
+ int err;
ASSERT(ecc->flt_func != NULL);
@@ -1107,15 +1108,9 @@ cpu_ce_log_err(struct async_flt *ecc, errorq_elem_t *eqep)
* Count errors per unum.
* Non-memory errors are all counted via a special unum string.
*/
- if (ce_count_unum(ecc->flt_status, len, unum) == PAGE_IS_FAILING &&
+ if ((err = ce_count_unum(ecc->flt_status, len, unum)) != PR_OK &&
automatic_page_removal) {
- page_t *pp = page_numtopp_nolock((pfn_t)
- (ecc->flt_addr >> MMU_PAGESHIFT));
-
- if (pp) {
- page_settoxic(pp, PAGE_IS_FAULTY);
- (void) page_retire(pp, PAGE_IS_FAILING);
- }
+ (void) page_retire(ecc->flt_addr, err);
}
if (ecc->flt_panic) {
@@ -2092,11 +2087,7 @@ cpu_async_log_err(void *flt)
if (!panicstr &&
(aflt->flt_stat & S_AFSR_ALL_ERRS) == P_AFSR_UE &&
aflt->flt_prot == AFLT_PROT_EC) {
- page_t *pp = page_numtopp_nolock((pfn_t)
- (aflt->flt_addr >> MMU_PAGESHIFT));
-
- if (pp != NULL && page_isretired(pp)) {
-
+ if (page_retire_check(aflt->flt_addr, NULL) == 0) {
/* Zero the address to clear the error */
softcall(ecc_page_zero, (void *)aflt->flt_addr);
return;
@@ -2305,25 +2296,7 @@ cpu_async_log_err(void *flt)
if (aflt->flt_addr != AFLT_INV_ADDR && aflt->flt_in_memory) {
if (!panicstr) {
- /*
- * Retire the bad page that caused the error
- */
- page_t *pp = page_numtopp_nolock((pfn_t)
- (aflt->flt_addr >> MMU_PAGESHIFT));
-
- if (pp != NULL) {
- page_settoxic(pp, PAGE_IS_FAULTY);
- (void) page_retire(pp, PAGE_IS_TOXIC);
- } else {
- uint64_t pa =
- P2ALIGN(aflt->flt_addr, MMU_PAGESIZE);
-
- cpu_aflt_log(CE_CONT, 3, spf_flt,
- CPU_ERRID_FIRST, NULL,
- ": cannot schedule clearing of error on "
- "page 0x%08x.%08x; page not in VM system",
- (uint32_t)(pa >> 32), (uint32_t)pa);
- }
+ (void) page_retire(aflt->flt_addr, PR_UE);
} else {
/*
* Clear UEs on panic so that we don't
@@ -4089,12 +4062,7 @@ static void
ecache_page_retire(void *arg)
{
uint64_t paddr = (uint64_t)arg;
- page_t *pp = page_numtopp_nolock((pfn_t)(paddr >> MMU_PAGESHIFT));
-
- if (pp) {
- page_settoxic(pp, PAGE_IS_FAULTY);
- (void) page_retire(pp, PAGE_IS_TOXIC);
- }
+ (void) page_retire(paddr, PR_UE);
}
void
@@ -4331,15 +4299,14 @@ add_leaky_bucket_timeout(void)
* false intermittents, so these intermittents can be safely ignored.
*
* If the error count is excessive for a DIMM, this function will return
- * PAGE_IS_FAILING, and the CPU module may then decide to remove that page
- * from use.
+ * PR_MCE, and the CPU module may then decide to remove that page from use.
*/
static int
ce_count_unum(int status, int len, char *unum)
{
int i;
struct ce_info *psimm = mem_ce_simm;
- int page_status = PAGE_IS_OK;
+ int page_status = PR_OK;
ASSERT(psimm != NULL);
@@ -4375,7 +4342,7 @@ ce_count_unum(int status, int len, char *unum)
cmn_err(CE_WARN,
"[AFT0] Sticky Softerror encountered "
"on Memory Module %s\n", unum);
- page_status = PAGE_IS_FAILING;
+ page_status = PR_MCE;
} else if (status & ECC_PERSISTENT) {
psimm[i].leaky_bucket_cnt = 1;
psimm[i].intermittent_total = 0;
@@ -4404,7 +4371,7 @@ ce_count_unum(int status, int len, char *unum)
cmn_err(CE_WARN,
"[AFT0] Sticky Softerror encountered "
"on Memory Module %s\n", unum);
- page_status = PAGE_IS_FAILING;
+ page_status = PR_MCE;
} else if (status & ECC_PERSISTENT) {
int new_value;
@@ -4422,7 +4389,7 @@ ce_count_unum(int status, int len, char *unum)
ecc_softerr_interval % 60);
atomic_add_16(
&psimm[i].leaky_bucket_cnt, -1);
- page_status = PAGE_IS_FAILING;
+ page_status = PR_MCE;
}
} else { /* Intermittent */
psimm[i].intermittent_total++;
@@ -4444,15 +4411,11 @@ ce_count_unum(int status, int len, char *unum)
void
cpu_ce_count_unum(struct async_flt *ecc, int len, char *unum)
{
- if (ce_count_unum(ecc->flt_status, len, unum) == PAGE_IS_FAILING &&
- automatic_page_removal) {
- page_t *pp = page_numtopp_nolock((pfn_t)
- (ecc->flt_addr >> MMU_PAGESHIFT));
+ int err;
- if (pp) {
- page_settoxic(pp, PAGE_IS_FAULTY);
- (void) page_retire(pp, PAGE_IS_FAILING);
- }
+ err = ce_count_unum(ecc->flt_status, len, unum);
+ if (err != PR_OK && automatic_page_removal) {
+ (void) page_retire(ecc->flt_addr, err);
}
}
diff --git a/usr/src/uts/sun4u/cpu/us3_common.c b/usr/src/uts/sun4u/cpu/us3_common.c
index f8d8b2bb77..f7cc35c664 100644
--- a/usr/src/uts/sun4u/cpu/us3_common.c
+++ b/usr/src/uts/sun4u/cpu/us3_common.c
@@ -2205,7 +2205,7 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep)
{
ch_async_flt_t *ch_flt = (ch_async_flt_t *)flt;
struct async_flt *aflt = (struct async_flt *)flt;
- page_t *pp;
+ uint64_t errors;
switch (ch_flt->flt_type) {
case CPU_INV_AFSR:
@@ -2236,9 +2236,6 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep)
*/
case CPU_CE:
case CPU_EMC:
- pp = page_numtopp_nolock((pfn_t)
- (aflt->flt_addr >> MMU_PAGESHIFT));
-
/*
* We want to skip logging and further classification
* only if ALL the following conditions are true:
@@ -2258,7 +2255,7 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep)
(C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_CE &&
aflt->flt_prot == AFLT_PROT_EC) {
- if (pp != NULL && page_isretired(pp)) {
+ if (page_retire_check(aflt->flt_addr, NULL) == 0) {
if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) {
/*
@@ -2289,17 +2286,17 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep)
*
* Note: Check cpu_impl_async_log_err if changing this
*/
- if (pp) {
- if (page_isretired(pp) || page_deteriorating(pp)) {
+ if (page_retire_check(aflt->flt_addr, &errors) == EINVAL) {
+ CE_XDIAG_SETSKIPCODE(aflt->flt_disp,
+ CE_XDIAG_SKIP_NOPP);
+ } else {
+ if (errors != PR_OK) {
CE_XDIAG_SETSKIPCODE(aflt->flt_disp,
CE_XDIAG_SKIP_PAGEDET);
} else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep,
offsetof(ch_async_flt_t, cmn_asyncflt))) {
return (0);
}
- } else {
- CE_XDIAG_SETSKIPCODE(aflt->flt_disp,
- CE_XDIAG_SKIP_NOPP);
}
/*FALLTHRU*/
@@ -2325,11 +2322,7 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep)
if (!panicstr && (ch_flt->afsr_errs &
(C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_UE &&
aflt->flt_prot == AFLT_PROT_EC) {
- page_t *pp = page_numtopp_nolock((pfn_t)
- (aflt->flt_addr >> MMU_PAGESHIFT));
-
- if (pp != NULL && page_isretired(pp)) {
-
+ if (page_retire_check(aflt->flt_addr, NULL) == 0) {
/* Zero the address to clear the error */
softcall(ecc_page_zero, (void *)aflt->flt_addr);
return (0);
@@ -2387,12 +2380,7 @@ void
cpu_page_retire(ch_async_flt_t *ch_flt)
{
struct async_flt *aflt = (struct async_flt *)ch_flt;
- page_t *pp = page_numtopp_nolock(aflt->flt_addr >> MMU_PAGESHIFT);
-
- if (pp != NULL) {
- page_settoxic(pp, PAGE_IS_FAULTY);
- (void) page_retire(pp, PAGE_IS_TOXIC);
- }
+ (void) page_retire(aflt->flt_addr, PR_UE);
}
/*
diff --git a/usr/src/uts/sun4u/cpu/us3_jalapeno.c b/usr/src/uts/sun4u/cpu/us3_jalapeno.c
index cd71848200..0b7936d426 100644
--- a/usr/src/uts/sun4u/cpu/us3_jalapeno.c
+++ b/usr/src/uts/sun4u/cpu/us3_jalapeno.c
@@ -316,7 +316,7 @@ cpu_impl_async_log_err(void *flt, errorq_elem_t *eqep)
{
ch_async_flt_t *ch_flt = (ch_async_flt_t *)flt;
struct async_flt *aflt = (struct async_flt *)flt;
- page_t *pp;
+ uint64_t errors;
switch (ch_flt->flt_type) {
@@ -329,19 +329,15 @@ cpu_impl_async_log_err(void *flt, errorq_elem_t *eqep)
return (CH_ASYNC_LOG_DONE);
case CPU_RCE:
- pp = page_numtopp_nolock((pfn_t)
- (aflt->flt_addr >> MMU_PAGESHIFT));
- if (pp) {
- if (page_isretired(pp) || page_deteriorating(pp)) {
- CE_XDIAG_SETSKIPCODE(aflt->flt_disp,
- CE_XDIAG_SKIP_PAGEDET);
- } else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep,
- offsetof(ch_async_flt_t, cmn_asyncflt))) {
- return (CH_ASYNC_LOG_RECIRC);
- }
- } else {
+ if (page_retire_check(aflt->flt_addr, &errors) == EINVAL) {
CE_XDIAG_SETSKIPCODE(aflt->flt_disp,
CE_XDIAG_SKIP_NOPP);
+ } else if (errors != PR_OK) {
+ CE_XDIAG_SETSKIPCODE(aflt->flt_disp,
+ CE_XDIAG_SKIP_PAGEDET);
+ } else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep,
+ offsetof(ch_async_flt_t, cmn_asyncflt))) {
+ return (CH_ASYNC_LOG_RECIRC);
}
/*FALLTHRU*/
/*
diff --git a/usr/src/uts/sun4u/io/pci/pci_ecc.c b/usr/src/uts/sun4u/io/pci/pci_ecc.c
index 0f92d73663..8820683ba6 100644
--- a/usr/src/uts/sun4u/io/pci/pci_ecc.c
+++ b/usr/src/uts/sun4u/io/pci/pci_ecc.c
@@ -534,21 +534,21 @@ ecc_err_handler(ecc_errstate_t *ecc_err_p)
* Called from ecc_err_drain below for CBINTR_CE case.
*/
static int
-ecc_err_cexdiag(page_t *pp, ecc_errstate_t *ecc_err,
- errorq_elem_t *eqep)
+ecc_err_cexdiag(ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
{
struct async_flt *ecc = &ecc_err->ecc_aflt;
+ uint64_t errors;
- if (!pp) {
+ if (page_retire_check(ecc->flt_addr, &errors) == EINVAL) {
CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_NOPP);
return (0);
- } else if (page_isretired(pp) || page_deteriorating(pp)) {
+ } else if (errors != PR_OK) {
CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_PAGEDET);
return (0);
+ } else {
+ return (ce_scrub_xdiag_recirc(ecc, pci_ecc_queue, eqep,
+ offsetof(ecc_errstate_t, ecc_aflt)));
}
-
- return (ce_scrub_xdiag_recirc(ecc, pci_ecc_queue, eqep,
- offsetof(ecc_errstate_t, ecc_aflt)));
}
/*
@@ -561,7 +561,6 @@ ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
{
struct async_flt *ecc = &ecc_err->ecc_aflt;
pci_t *pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[0];
- page_t *pp;
int ecc_type = ecc_err->ecc_ii_p.ecc_type;
if (pci_p == NULL)
@@ -581,13 +580,10 @@ ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
ecc_cpu_call(ecc, ecc_err->ecc_unum, (ecc_type == CBNINTR_UE) ?
ECC_IO_UE : ECC_IO_CE);
- pp = page_numtopp_nolock(ecc->flt_addr >> MMU_PAGESHIFT);
-
switch (ecc_type) {
case CBNINTR_UE:
- if (pp && ecc_err->ecc_pg_ret == 1) {
- page_settoxic(pp, PAGE_IS_FAULTY);
- (void) page_retire(pp, PAGE_IS_TOXIC);
+ if (ecc_err->ecc_pg_ret == 1) {
+ (void) page_retire(ecc->flt_addr, PR_UE);
}
ecc_err->ecc_err_type = flt_to_error_type(ecc);
break;
@@ -609,7 +605,7 @@ ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
/* ecc_err_cexdiag returns nonzero to recirculate */
if (CE_XDIAG_EXT_ALG_APPLIED(ecc->flt_disp) &&
- ecc_err_cexdiag(pp, ecc_err, eqep))
+ ecc_err_cexdiag(ecc_err, eqep))
return;
ecc_err->ecc_err_type = flt_to_error_type(ecc);
break;
diff --git a/usr/src/uts/sun4u/ngdr/io/dr_mem.c b/usr/src/uts/sun4u/ngdr/io/dr_mem.c
index e876db93b5..1dd67f5824 100644
--- a/usr/src/uts/sun4u/ngdr/io/dr_mem.c
+++ b/usr/src/uts/sun4u/ngdr/io/dr_mem.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -80,8 +80,6 @@ static struct memlist *memlist_del_span(struct memlist *mlist,
static struct memlist *memlist_cat_span(struct memlist *mlist,
uint64_t base, uint64_t len);
-extern void page_unretire_pages(void);
-
/*
* dr_mem_unit_t.sbm_flags
*/
@@ -427,57 +425,13 @@ dr_mem_ecache_scrub(dr_mem_unit_t *mp, struct memlist *mlist)
#endif /* DEBUG */
}
-/*
- * This function marks as clean, all the faulty pages that belong to the
- * board that is copy-renamed since they are not likely to be bad pages
- * after the rename. This includes the retired pages on the board.
- */
-
-static void
-dr_memlist_clrpages(struct memlist *r_ml)
-{
- struct memlist *t_ml;
- page_t *pp, *epp;
- pfn_t pfn, epfn;
- struct memseg *seg;
-
- if (r_ml == NULL)
- return;
-
- for (t_ml = r_ml; (t_ml != NULL); t_ml = t_ml->next) {
- pfn = _b64top(t_ml->address);
- epfn = _b64top(t_ml->address + t_ml->size);
-
- for (seg = memsegs; seg != NULL; seg = seg->next) {
- if (pfn >= seg->pages_end || epfn < seg->pages_base)
- continue;
-
- pp = seg->pages;
- if (pfn > seg->pages_base)
- pp += pfn - seg->pages_base;
-
- epp = seg->epages;
- if (epfn < seg->pages_end)
- epp -= seg->pages_end - epfn;
-
- ASSERT(pp < epp);
- while (pp < epp) {
- if (page_isfaulty((page_t *)pp))
- page_clrtoxic_flag((page_t *)pp,
- PAGE_IS_FAULTY);
- pp++;
- }
- }
- }
-}
-
static int
dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp)
{
time_t copytime;
drmachid_t cr_id;
dr_sr_handle_t *srhp;
- struct memlist *c_ml, *d_ml, *r_ml;
+ struct memlist *c_ml, *d_ml;
sbd_error_t *err;
static fn_t f = "dr_move_memory";
@@ -507,11 +461,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp)
d_ml = d_ml->next;
}
- /*
- * create a copy of the memlist to be used for retiring pages.
- */
- r_ml = memlist_dup(c_ml);
-
affinity_set(drmach_mem_cpu_affinity(t_mp->sbm_cm.sbdev_id));
err = drmach_copy_rename_init(
@@ -520,7 +469,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp)
if (err) {
DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err);
affinity_clear();
- memlist_delete(r_ml);
return (-1);
}
@@ -553,7 +501,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp)
hp->h_err = NULL;
affinity_clear();
- memlist_delete(r_ml);
return (-1);
}
@@ -573,12 +520,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp)
drmach_copy_rename(cr_id);
- /*
- * Clear pages that have been marked as faulty since we are
- * changing the physical memory for the pages.
- */
- dr_memlist_clrpages(r_ml);
-
/* Resume the OS. */
dr_resume(srhp);
@@ -594,11 +535,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp)
PR_MEM("%s: copy-rename elapsed time = %ld ticks (%ld secs)\n",
f, copytime, copytime / hz);
- memlist_delete(r_ml);
-
- /* Unretire any pages cleared after copy-rename */
- page_unretire_pages();
-
/* return -1 if dr_suspend or copy/rename recorded an error */
return (err == NULL ? 0 : -1);
}
diff --git a/usr/src/uts/sun4u/os/ecc.c b/usr/src/uts/sun4u/os/ecc.c
index 10b6cb523f..bd933377e4 100644
--- a/usr/src/uts/sun4u/os/ecc.c
+++ b/usr/src/uts/sun4u/os/ecc.c
@@ -247,23 +247,21 @@ error_init(void)
}
/*
- * Success flags for ecc_page_zero
+ * Outcome flags for the ecc_page_zero DTrace probe; ecc_page_zero() runs
+ * as a softint handler and so cannot return a status directly.
*/
#define PAGE_ZERO_SUCCESS 0
#define PAGE_ZERO_FAIL_NOLOCK 1
#define PAGE_ZERO_FAIL_ONTRAP 2
-/*
- * arg is a physical address - zero out the page that contains it
- */
void
ecc_page_zero(void *arg)
{
uint64_t pa = (uint64_t)arg;
- page_t *pp = page_numtopp_nolock((pfn_t)(pa >> MMU_PAGESHIFT));
int ret, success_flag;
+ page_t *pp = page_numtopp_nolock(mmu_btop(pa));
- if (pp == NULL || !page_isretired(pp))
+ if (page_retire_check(pa, NULL) != 0)
return;
/*
diff --git a/usr/src/uts/sun4u/os/mach_cpu_states.c b/usr/src/uts/sun4u/os/mach_cpu_states.c
index 0815f54170..4144c91c79 100644
--- a/usr/src/uts/sun4u/os/mach_cpu_states.c
+++ b/usr/src/uts/sun4u/os/mach_cpu_states.c
@@ -66,7 +66,6 @@ extern int disable_watchdog_on_exit;
void
mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb)
{
- page_t *first, *pp;
extern void pm_cfb_check_and_powerup(void);
/*
@@ -79,25 +78,6 @@ mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb)
}
/*
- * Clear any unresolved UEs from memory. We rely on the fact that on
- * sun4u, pagezero() will always clear UEs. Since we're rebooting, we
- * just force p_selock to appear locked so pagezero()'s assert works.
- *
- * Pages that were retired successfully due to multiple CEs will
- * also be cleared.
- */
- if (memsegs != NULL) {
- pp = first = page_first();
- do {
- if (page_isretired(pp) || page_istoxic(pp)) {
- /* pagezero asserts PAGE_LOCKED */
- pp->p_selock = -1;
- pagezero(pp, 0, PAGESIZE);
- }
- } while ((pp = page_next(pp)) != first);
- }
-
- /*
* XXX - rconsvp is set to NULL to ensure that output messages
* are sent to the underlying "hardware" device using the
* monitor's printf routine since we are in the process of
@@ -123,6 +103,12 @@ mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb)
(void) callb_execute_class(CB_CL_MDBOOT, NULL);
/*
+ * Clear any unresolved UEs from memory.
+ */
+ if (memsegs != NULL)
+ page_retire_hunt(page_retire_mdboot_cb);
+
+ /*
* stop other cpus which also raise our priority. since there is only
* one active cpu after this, and our priority will be too high
* for us to be preempted, we're essentially single threaded
diff --git a/usr/src/uts/sun4v/os/error.c b/usr/src/uts/sun4v/os/error.c
index 9d13b1781b..bd2b7fde49 100644
--- a/usr/src/uts/sun4v/os/error.c
+++ b/usr/src/uts/sun4v/os/error.c
@@ -87,8 +87,7 @@ static uint32_t rq_overflow_count = 0; /* counter for rq overflow */
static void cpu_queue_one_event(errh_async_flt_t *);
static uint32_t count_entries_on_queue(uint64_t, uint64_t, uint32_t);
-static void errh_page_settoxic(errh_async_flt_t *, uchar_t);
-static void errh_page_retire(errh_async_flt_t *);
+static void errh_page_retire(errh_async_flt_t *, uchar_t);
static int errh_error_protected(struct regs *, struct async_flt *, int *);
static void errh_rq_full(struct async_flt *);
static void ue_drain(void *, struct async_flt *, errorq_elem_t *);
@@ -300,12 +299,10 @@ process_nonresumable_error(struct regs *rp, uint64_t tl,
}
/*
- * If it is a memory error, we turn on the PAGE_IS_TOXIC
- * flag. The page will be retired later and scrubbed when
- * it is freed.
+ * Call page_retire() to handle memory errors.
*/
if (errh_flt.errh_er.attr & ERRH_ATTR_MEM)
- (void) errh_page_settoxic(&errh_flt, PAGE_IS_TOXIC);
+ errh_page_retire(&errh_flt, PR_UE);
/*
* If we queued an error and it was in user mode or
@@ -443,10 +440,10 @@ cpu_async_log_err(void *flt)
case ERRH_DESC_UCOR_RE:
if (errh_erp->attr & ERRH_ATTR_MEM) {
/*
- * Turn on the PAGE_IS_TOXIC flag. The page will be
+ * Turn on the PR_UE flag. The page will be
* scrubbed when it is freed.
*/
- (void) errh_page_settoxic(errh_fltp, PAGE_IS_TOXIC);
+ errh_page_retire(errh_fltp, PR_UE);
}
break;
@@ -458,7 +455,7 @@ cpu_async_log_err(void *flt)
* For non-resumable memory error, retire
* the page here.
*/
- errh_page_retire(errh_fltp);
+ errh_page_retire(errh_fltp, PR_UE);
/*
* If we are going to panic, scrub the page first
@@ -518,9 +515,8 @@ cpu_ue_log_err(struct async_flt *aflt)
* Retire the page(s) in the error memory region.
*/
static void
-errh_page_settoxic(errh_async_flt_t *errh_fltp, uchar_t flag)
+errh_page_retire(errh_async_flt_t *errh_fltp, uchar_t flag)
{
- page_t *pp;
uint64_t flt_real_addr_start = errh_fltp->errh_er.ra;
uint64_t flt_real_addr_end = flt_real_addr_start +
errh_fltp->errh_er.sz - 1;
@@ -531,38 +527,7 @@ errh_page_settoxic(errh_async_flt_t *errh_fltp, uchar_t flag)
for (current_addr = flt_real_addr_start;
current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) {
- pp = page_numtopp_nolock((pfn_t)
- (current_addr >> MMU_PAGESHIFT));
-
- if (pp != NULL) {
- page_settoxic(pp, flag);
- }
- }
-}
-
-/*
- * Retire the page(s) indicated in the error report.
- */
-static void
-errh_page_retire(errh_async_flt_t *errh_fltp)
-{
- page_t *pp;
- uint64_t flt_real_addr_start = errh_fltp->errh_er.ra;
- uint64_t flt_real_addr_end = flt_real_addr_start +
- errh_fltp->errh_er.sz - 1;
- int64_t current_addr;
-
- if (errh_fltp->errh_er.sz == 0)
- return;
-
- for (current_addr = flt_real_addr_start;
- current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) {
- pp = page_numtopp_nolock((pfn_t)
- (current_addr >> MMU_PAGESHIFT));
-
- if (pp != NULL) {
- (void) page_retire(pp, PAGE_IS_TOXIC);
- }
+ (void) page_retire(current_addr, flag);
}
}
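With both helpers folded into one, retiring an error region is a single loop over physical addresses. A minimal standalone sketch of the same pattern (the helper name is hypothetical; page_retire(), PR_UE, PR_MCE and MMU_PAGESIZE are the names this changeset uses):

/*
 * Hypothetical example: retire every page in an error report's
 * region. flag is PR_UE for an uncorrectable error or PR_MCE for a
 * page with excessive correctable errors; page_retire() returns 0
 * or an errno, which best-effort callers such as the interrupt
 * paths above simply discard.
 */
static void
retire_region(uint64_t ra, uint64_t sz, uchar_t flag)
{
	uint64_t pa;

	for (pa = ra; pa < ra + sz; pa += MMU_PAGESIZE)
		(void) page_retire(pa, flag);
}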
diff --git a/usr/src/uts/sun4v/os/mach_cpu_states.c b/usr/src/uts/sun4v/os/mach_cpu_states.c
index 75e2421e69..46c1fdbeff 100644
--- a/usr/src/uts/sun4v/os/mach_cpu_states.c
+++ b/usr/src/uts/sun4v/os/mach_cpu_states.c
@@ -106,29 +106,9 @@ extern uint64_t get_cpuaddr(uint64_t, uint64_t);
void
mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb)
{
- page_t *first, *pp;
extern void pm_cfb_check_and_powerup(void);
/*
- * Clear any unresolved UEs from memory. We rely on the fact that on
- * sun4u, pagezero() will always clear UEs. Since we're rebooting, we
- * just force p_selock to appear locked so pagezero()'s assert works.
- *
- * Pages that were retired successfully due to multiple CEs will
- * also be cleared.
- */
- if (memsegs != NULL) {
- pp = first = page_first();
- do {
- if (page_isretired(pp) || page_istoxic(pp)) {
- /* pagezero asserts PAGE_LOCKED */
- pp->p_selock = -1;
- pagezero(pp, 0, PAGESIZE);
- }
- } while ((pp = page_next(pp)) != first);
- }
-
- /*
* XXX - rconsvp is set to NULL to ensure that output messages
* are sent to the underlying "hardware" device using the
* monitor's printf routine since we are in the process of
@@ -154,6 +134,12 @@ mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb)
(void) callb_execute_class(CB_CL_MDBOOT, NULL);
/*
+ * Clear any unresolved UEs from memory.
+ */
+ if (memsegs != NULL)
+ page_retire_hunt(page_retire_mdboot_cb);
+
+ /*
* stop other cpus which also raise our priority. since there is only
* one active cpu after this, and our priority will be too high
* for us to be preempted, we're essentially single threaded