-rw-r--r-- | usr/src/uts/common/Makefile.files | 1
-rw-r--r-- | usr/src/uts/common/io/mem.c | 63
-rw-r--r-- | usr/src/uts/common/os/mem_config.c | 67
-rw-r--r-- | usr/src/uts/common/sys/mem.h | 20
-rw-r--r-- | usr/src/uts/common/vm/page.h | 110
-rw-r--r-- | usr/src/uts/common/vm/page_lock.c | 198
-rw-r--r-- | usr/src/uts/common/vm/page_retire.c | 1473
-rw-r--r-- | usr/src/uts/common/vm/vm_page.c | 850
-rw-r--r-- | usr/src/uts/common/vm/vm_pagelist.c | 20
-rw-r--r-- | usr/src/uts/i86pc/os/machdep.c | 2
-rw-r--r-- | usr/src/uts/sun4u/cpu/spitfire.c | 77
-rw-r--r-- | usr/src/uts/sun4u/cpu/us3_common.c | 30
-rw-r--r-- | usr/src/uts/sun4u/cpu/us3_jalapeno.c | 20
-rw-r--r-- | usr/src/uts/sun4u/io/pci/pci_ecc.c | 24
-rw-r--r-- | usr/src/uts/sun4u/ngdr/io/dr_mem.c | 68
-rw-r--r-- | usr/src/uts/sun4u/os/ecc.c | 10
-rw-r--r-- | usr/src/uts/sun4u/os/mach_cpu_states.c | 26
-rw-r--r-- | usr/src/uts/sun4v/os/mach_cpu_states.c | 26 |
19 files changed, 1986 insertions, 1150 deletions
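Before the diff itself, a sketch of the new user-visible surface. The change grows the /dev/mem ioctl set so that a userland consumer such as fmd's diagnosis engine can retire, unretire, and query pages by physical address. The code below is illustrative only and is not part of the change: the command values are copied from the sys/mem.h hunk further down, the helper name is made up, and a nonzero return from the kernel ioctl handler surfaces in userland as -1 with errno set (for example EAGAIN when the page could not be retired immediately).

/*
 * Hypothetical userland sketch: drive the new page retire ioctls.
 * Command values copied from the sys/mem.h hunk in this diff.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdint.h>

#define	MEM_PAGE_RETIRE		(('M' << 8) | 0x02)
#define	MEM_PAGE_ISRETIRED	(('M' << 8) | 0x03)

static int
retire_page(uint64_t pa)
{
	uint64_t arg = pa;
	int fd, rc;

	if ((fd = open("/dev/mem", O_RDONLY)) == -1)
		return (-1);

	/* Request an FMA-directed retire of the page containing 'pa'. */
	rc = ioctl(fd, MEM_PAGE_RETIRE, &arg);
	if (rc == -1 && errno == EAGAIN) {
		/* Retire is pending; ask whether it has completed yet. */
		arg = pa;
		rc = ioctl(fd, MEM_PAGE_ISRETIRED, &arg);
	}
	(void) close(fd);
	return (rc);
}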
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 57b3f1968f..32c38cdac3 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -51,6 +51,7 @@ COMMON_CORE_OBJS += \ lgrp_topo.o \ mutex.o \ page_lock.o \ + page_retire.o \ panic.o \ param.o \ putnext.o \ diff --git a/usr/src/uts/common/io/mem.c b/usr/src/uts/common/io/mem.c index 3aa34f9427..1e42907a5d 100644 --- a/usr/src/uts/common/io/mem.c +++ b/usr/src/uts/common/io/mem.c @@ -460,47 +460,51 @@ mmioctl_vtop(intptr_t data) } /* - * Given a PA, retire that page or check whether it has already been retired. + * Given a PA, execute the given page retire command on it. */ static int mmioctl_page_retire(int cmd, intptr_t data) { + extern int page_retire_test(void); uint64_t pa; - pfn_t pfn; - page_t *pp; - if (copyin((void *)data, &pa, sizeof (uint64_t))) + if (copyin((void *)data, &pa, sizeof (uint64_t))) { return (EFAULT); + } - pfn = pa >> MMU_PAGESHIFT; + switch (cmd) { + case MEM_PAGE_ISRETIRED: + return (page_retire_check(pa, NULL)); - if (!pf_is_memory(pfn) || (pp = page_numtopp_nolock(pfn)) == NULL) - return (EINVAL); + case MEM_PAGE_UNRETIRE: + return (page_unretire(pa)); - /* - * If we're checking, see if the page is retired; if not, confirm that - * its status is at least set to be failing. If neither, return EIO. - */ - if (cmd == MEM_PAGE_ISRETIRED) { - if (page_isretired(pp)) - return (0); + case MEM_PAGE_RETIRE: + return (page_retire(pa, PR_FMA)); - if (!page_isfailing(pp)) - return (EIO); + case MEM_PAGE_RETIRE_MCE: + return (page_retire(pa, PR_MCE)); - return (EAGAIN); - } + case MEM_PAGE_RETIRE_UE: + return (page_retire(pa, PR_UE)); - /* - * Try to retire the page. If the retire fails, it will be scheduled to - * occur when the page is freed. If this page is out of circulation - * already, or is in the process of being retired, we fail. - */ - if (page_isretired(pp) || page_isfailing(pp)) - return (EIO); + case MEM_PAGE_GETERRORS: + { + uint64_t page_errors; + int rc = page_retire_check(pa, &page_errors); + if (copyout(&page_errors, (void *)data, + sizeof (uint64_t))) { + return (EFAULT); + } + return (rc); + } + + case MEM_PAGE_RETIRE_TEST: + return (page_retire_test()); + + } - page_settoxic(pp, PAGE_IS_FAULTY); - return (page_retire(pp, PAGE_IS_FAILING) ? EAGAIN : 0); + return (EINVAL); } #ifdef __sparc @@ -606,6 +610,11 @@ mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp) case MEM_PAGE_RETIRE: case MEM_PAGE_ISRETIRED: + case MEM_PAGE_UNRETIRE: + case MEM_PAGE_RETIRE_MCE: + case MEM_PAGE_RETIRE_UE: + case MEM_PAGE_GETERRORS: + case MEM_PAGE_RETIRE_TEST: if (getminor(dev) != M_MEM) return (ENXIO); return (mmioctl_page_retire(cmd, data)); diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c index 0d29cc59d6..8f398ac602 100644 --- a/usr/src/uts/common/os/mem_config.c +++ b/usr/src/uts/common/os/mem_config.c @@ -1770,31 +1770,13 @@ delete_memory_thread(caddr_t amhp) } if (!page_try_reclaim_lock(pp, SE_EXCL, - SE_EXCL_WANTED)) { - if (page_isretired(pp)) { - /* - * Page has been retired. - * - * Its shared lock can and - * must be upgraded to an - * exclusive lock in order - * to hashout the page when - * the delete completes. - */ - page_lock_clr_exclwanted(pp); - if (!page_tryupgrade(pp)) { - mutex_enter( - &mhp->mh_mutex); - continue; - } - } else { - /* - * Page in use elsewhere. 
- */ - MDSTAT_INCR(mhp, lockfail); - mutex_enter(&mhp->mh_mutex); - continue; - } + SE_EXCL_WANTED | SE_RETIRED)) { + /* + * Page in use elsewhere. Skip it. + */ + MDSTAT_INCR(mhp, lockfail); + mutex_enter(&mhp->mh_mutex); + continue; } /* * See if the cage expanded into the delete. @@ -1802,15 +1784,12 @@ delete_memory_thread(caddr_t amhp) * cage to expand. */ if (PP_ISNORELOC(pp)) { - if (page_isretired(pp)) - page_downgrade(pp); - else - page_unlock(pp); + page_unlock(pp); mutex_enter(&mhp->mh_mutex); mhp->mh_cancel = KPHYSM_ENONRELOC; break; } - if (page_isretired(pp)) { + if (PP_RETIRED(pp)) { /* * Page has been retired and is * not part of the cage so we @@ -1861,11 +1840,11 @@ delete_memory_thread(caddr_t amhp) } /* * Keep stats on pages encountered that - * are toxic or failing but not retired. + * are marked for retirement. */ - if (page_istoxic(pp)) { + if (PP_TOXIC(pp)) { MDSTAT_INCR(mhp, toxic); - } else if (page_isfailing(pp)) { + } else if (PP_PR_REQ(pp)) { MDSTAT_INCR(mhp, failing); } /* @@ -1876,7 +1855,7 @@ delete_memory_thread(caddr_t amhp) * previously associated with the page. */ if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { - if (!page_istoxic(pp)) { + if (!PP_TOXIC(pp)) { /* * Must relocate locked in * memory pages. @@ -1949,7 +1928,7 @@ delete_memory_thread(caddr_t amhp) #ifdef MEM_DEL_STATS start_pgrp = ddi_get_lbolt(); #endif /* MEM_DEL_STATS */ - if (mod && !page_istoxic(pp)) { + if (mod && !PP_TOXIC(pp)) { /* * Lock all constituent pages * of a large page to ensure @@ -2020,7 +1999,7 @@ delete_memory_thread(caddr_t amhp) * set, we cannot do anything here to deal * with it. */ - if (page_istoxic(pp)) { + if (PP_TOXIC(pp)) { page_unlock(pp); #ifdef MEM_DEL_STATS ntick_pgrp = (uint64_t)ddi_get_lbolt() - @@ -2067,7 +2046,7 @@ delete_memory_thread(caddr_t amhp) continue; } if (page_try_reclaim_lock(pp, SE_EXCL, - SE_EXCL_WANTED)) { + SE_EXCL_WANTED | SE_RETIRED)) { if (PP_ISFREE(pp)) { goto free_page_collect; } @@ -2229,12 +2208,8 @@ delete_memory_thread(caddr_t amhp) /* * If the memory delete was cancelled, exclusive-wanted bits must - * be cleared, and also any retired pages that - * were accounted for above must have their exclusive lock - * downgraded to a shared lock to return them to their previous - * state. - * Otherwise, if the memory delete has completed, retired pages - * must be hashed out. + * be cleared. If there are retired pages being deleted, they need + * to be unretired. */ for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; mdsp = mdsp->mds_next) { @@ -2264,16 +2239,16 @@ delete_memory_thread(caddr_t amhp) pp = page_numtopp_nolock(pfn); } ASSERT(pp != NULL); - ASSERT(page_isretired(pp)); + ASSERT(PP_RETIRED(pp)); if (mhp->mh_cancel != 0) { - page_downgrade(pp); + page_unlock(pp); /* * To satisfy ASSERT below in * cancel code. */ mhp->mh_hold_todo++; } else { - page_hashout(pp, (kmutex_t *)NULL); + (void) page_unretire_pp(pp, 0); } } } diff --git a/usr/src/uts/common/sys/mem.h b/usr/src/uts/common/sys/mem.h index e741d56b9f..f2b23b8029 100644 --- a/usr/src/uts/common/sys/mem.h +++ b/usr/src/uts/common/sys/mem.h @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -60,11 +60,25 @@ typedef struct mem_vtop { * and drivers should not make use of these interfaces: they can change without * notice and programs that consume them will fail to run on future releases. 
*/ -#define MEM_PAGE_RETIRE (('M' << 8) | 0x02) -#define MEM_PAGE_ISRETIRED (('M' << 8) | 0x03) #define MEM_NAME (('M' << 8) | 0x04) #define MEM_INFO (('M' << 8) | 0x05) +#define MEM_PAGE_RETIRE (('M' << 8) | 0x02) +#define MEM_PAGE_ISRETIRED (('M' << 8) | 0x03) +#define MEM_PAGE_UNRETIRE (('M' << 8) | 0x06) +#define MEM_PAGE_GETERRORS (('M' << 8) | 0x07) +#define MEM_PAGE_RETIRE_MCE (('M' << 8) | 0x08) +#define MEM_PAGE_RETIRE_UE (('M' << 8) | 0x09) +#define MEM_PAGE_RETIRE_TEST (('M' << 8) | 0x0A) + +/* + * Bits returned from MEM_PAGE_GETERRORS ioctl for use by fmd(1M). + */ +#define MEM_PAGE_ERR_NONE 0x0 +#define MEM_PAGE_ERR_MULTI_CE 0x1 +#define MEM_PAGE_ERR_UE 0x2 +#define MEM_PAGE_ERR_FMA_REQ 0x8 + typedef struct mem_name { uint64_t m_addr; /* memory address */ uint64_t m_synd; /* architecture-specific syndrome */ diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h index 2e4183bdc0..c1db6f1391 100644 --- a/usr/src/uts/common/vm/page.h +++ b/usr/src/uts/common/vm/page.h @@ -76,6 +76,12 @@ typedef enum { */ #define SE_EXCL_WANTED 0x02 +/* + * All page_*lock() requests will be denied unless this flag is set in + * the 'es' parameter. + */ +#define SE_RETIRED 0x04 + #endif /* _KERNEL | _KMEMUSER */ typedef int selock_t; @@ -630,37 +636,6 @@ struct lgrp; #define PG_LIST_ISCAGE 0x2000 /* - * Flags for setting the p_toxic flag when a page has errors - * These flags may be OR'ed into the p_toxic page flag to - * indicate that error(s) have occurred on a page, - * (see page_settoxic()). If both PAGE_IS_TOXIC and - * PAGE_IS_FAILING are set, PAGE_IS_FAILING takes precedence. - * - * When an error happens on a page, the trap handler sets - * PAGE_IS_FAULTY on the page to indicate that an error has been - * seen on the page. The error could be really a memory error or - * something else (like a datapath error). When it is determined - * that it is a memory error, the page is marked as PAGE_IS_TOXIC - * or PAGE_IS_FAILING depending on the type of error and then - * retired. - * - * We use the page's 'toxic' flag to determine whether the page - * has just got a single error - PAGE_IS_TOXIC - or is being - * retired due to multiple soft errors - PAGE_IS_FAILING. In - * page_free(), a page that has been marked PAGE_IS_FAILING will - * not be cleaned, it will always be retired. A page marked - * PAGE_IS_TOXIC is cleaned and is retired only if this attempt at - * cleaning fails. - * - * When a page has been successfully retired, we set PAGE_IS_RETIRED. - */ -#define PAGE_IS_OK 0x0 -#define PAGE_IS_TOXIC 0x1 -#define PAGE_IS_FAILING 0x2 -#define PAGE_IS_RETIRED 0x4 -#define PAGE_IS_FAULTY 0x8 - -/* * Page frame operations. 
 */
page_t *page_lookup(struct vnode *, u_offset_t, se_t);
@@ -707,6 +682,7 @@ void page_boot_demote(page_t *);
 void page_promote_size(page_t *, uint_t);
 void page_list_add_pages(page_t *, int);
 void page_list_sub(page_t *, int);
+void page_list_sub_pages(page_t *, uint_t);
 void page_list_xfer(page_t *, int, int);
 void page_list_break(page_t **, page_t **, size_t);
 void page_list_concat(page_t **, page_t **);
@@ -720,6 +696,7 @@ int page_try_reclaim_lock(page_t *, se_t, int);
 int page_tryupgrade(page_t *);
 void page_downgrade(page_t *);
 void page_unlock(page_t *);
+void page_unlock_noretire(page_t *);
 void page_lock_delete(page_t *);
 int page_pp_lock(page_t *, int, int);
 void page_pp_unlock(page_t *, int, int);
@@ -759,19 +736,22 @@ int page_isfree(page_t *);
 int page_isref(page_t *);
 int page_ismod(page_t *);
 int page_release(page_t *, int);
-int page_retire(page_t *, uchar_t);
-int page_istoxic(page_t *);
-int page_isfailing(page_t *);
-int page_isretired(page_t *);
-int page_deteriorating(page_t *);
+void page_retire_init(void);
+int page_retire(uint64_t, uchar_t);
+int page_retire_check(uint64_t, uint64_t *);
+int page_unretire(uint64_t);
+int page_unretire_pp(page_t *, int);
+void page_tryretire(page_t *);
+void page_retire_hunt(void (*)(page_t *));
+void page_retire_mdboot_cb(page_t *);
+void page_clrtoxic(page_t *, uchar_t);
 void page_settoxic(page_t *, uchar_t);
-void page_clrtoxic(page_t *);
-void page_clrtoxic_flag(page_t *, uchar_t);
-int page_isfaulty(page_t *);
+
 int page_mem_avail(pgcnt_t);
 void page_set_props(page_t *, uint_t);
 void page_clr_all_props(page_t *);
+int page_clear_lck_cow(page_t *, int);
 
 kmutex_t *page_vnode_mutex(struct vnode *);
 kmutex_t *page_se_mutex(struct page *);
@@ -792,6 +772,7 @@ void page_free_replacement_page(page_t *);
 int page_relocate_cage(page_t **, page_t **);
 
 int page_try_demote_pages(page_t *);
+int page_try_demote_free_pages(page_t *);
 void page_demote_free_pages(page_t *);
 
 struct anon_map;
@@ -879,7 +860,56 @@ int page_szc_user_filtered(size_t);
 #define	PP_CLRMIGRATE(pp)	((pp)->p_state &= ~P_MIGRATE)
 #define	PP_CLRSWAP(pp)		((pp)->p_state &= ~P_SWAP)
-
+/*
+ * Flags for page_t p_toxic, for tracking memory hardware errors.
+ *
+ * These flags are OR'ed into p_toxic with page_settoxic() to track which
+ * error(s) have occurred on a given page. The flags are cleared with
+ * page_clrtoxic(). Both page_settoxic() and page_clrtoxic() use atomic
+ * primitives to manipulate the p_toxic field so no other locking is needed.
+ *
+ * When an error occurs on a page, p_toxic is set to record the error. The
+ * error could be a memory error or something else (e.g. a datapath). The Page
+ * Retire mechanism does not try to determine the exact cause of the error;
+ * Page Retire rightly leaves that sort of determination to FMA's Diagnostic
+ * Engine (DE).
+ *
+ * Note that, while p_toxic bits can be set without holding any locks, they
+ * should only be cleared while holding the page exclusively locked.
+ *
+ * Pages with PR_UE or PR_FMA flags are retired unconditionally, while pages
+ * with PR_MCE are retired if the system has not retired too many of them.
+ *
+ * A page must be exclusively locked to be retired. Pages can be retired if
+ * they are mapped, modified, or both, as long as they are not marked PR_UE,
+ * since pages with uncorrectable errors cannot be relocated in memory.
+ * Once a page has been successfully retired it is zeroed, attached to the
+ * retired_pages vnode and, finally, PR_RETIRED is set in p_toxic. 
The other + * p_toxic bits are NOT cleared. Pages are not left locked after retiring them + * to avoid special case code throughout the kernel; rather, page_*lock() will + * fail to lock the page, unless SE_RETIRED is passed as an argument. + * + * While we have your attention, go take a look at the comments at the + * beginning of page_retire.c too. + */ +#define PR_OK 0x00 /* no problem */ +#define PR_MCE 0x01 /* page has seen two or more CEs */ +#define PR_UE 0x02 /* page has an unhandled UE */ +#define PR_UE_SCRUBBED 0x04 /* page has seen a UE but was cleaned */ +#define PR_FMA 0x08 /* A DE wants this page retired */ +#define PR_RESV 0x10 /* Reserved for future use */ +#define PR_BUSY 0x20 /* Page retire is in progress */ +#define PR_MSG 0x40 /* message(s) already printed for this page */ +#define PR_RETIRED 0x80 /* This page has been retired */ + +#define PR_REASONS (PR_UE | PR_MCE | PR_FMA) +#define PR_TOXIC (PR_UE) +#define PR_ERRMASK (PR_UE | PR_UE_SCRUBBED | PR_MCE | PR_FMA) +#define PR_ALLFLAGS (0xFF) + +#define PP_RETIRED(pp) ((pp)->p_toxic & PR_RETIRED) +#define PP_TOXIC(pp) ((pp)->p_toxic & PR_TOXIC) +#define PP_PR_REQ(pp) (((pp)->p_toxic & PR_REASONS) && !PP_RETIRED(pp)) /* * kpm large page description. diff --git a/usr/src/uts/common/vm/page_lock.c b/usr/src/uts/common/vm/page_lock.c index 9a2d12dd8e..d34f7b2737 100644 --- a/usr/src/uts/common/vm/page_lock.c +++ b/usr/src/uts/common/vm/page_lock.c @@ -189,16 +189,17 @@ uint_t page_lock_reclaim; uint_t page_lock_bad_reclaim; uint_t page_lock_same_page; uint_t page_lock_upgrade; +uint_t page_lock_retired; uint_t page_lock_upgrade_failed; uint_t page_lock_deleted; uint_t page_trylock_locked; +uint_t page_trylock_failed; uint_t page_trylock_missed; uint_t page_try_reclaim_upgrade; #endif /* VM_STATS */ - /* * Acquire the "shared/exclusive" lock on a page. * @@ -222,27 +223,47 @@ page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim) * callers wanting an exclusive (writer) lock may prevent shared-lock * (reader) starvation by setting the es parameter to SE_EXCL_WANTED. * In this case, when an exclusive lock cannot be acquired, p_selock's - * SE_EWANTED bit is set. - * This bit, along with the se and es parameters, are used to decide - * if the requested lock should be granted: + * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied + * if the page is slated for retirement. + * + * The se and es parameters determine if the lock should be granted + * based on the following decision table: + * + * Lock wanted es flags p_selock/SE_EWANTED Action + * ----------- -------------- ------------------- --------- + * SE_EXCL any [1][2] unlocked/any grant lock, clear SE_EWANTED + * SE_EXCL SE_EWANTED any lock/any deny, set SE_EWANTED + * SE_EXCL none any lock/any deny + * SE_SHARED n/a [2][3] shared/0 grant + * SE_SHARED n/a [2][3] unlocked/0 grant + * SE_SHARED n/a shared/1 deny + * SE_SHARED n/a unlocked/1 deny + * SE_SHARED n/a excl/any deny + * + * Notes: + * [1] The code grants an exclusive lock to the caller and clears the bit + * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED + * bit's value. This was deemed acceptable as we are not concerned about + * exclusive-lock starvation. If this ever becomes an issue, a priority or + * fifo mechanism should also be implemented. 
Meantime, the thread that
+ * set SE_EWANTED should be prepared to catch this condition and reset it.
+ *
+ * [2] Retired pages may not be locked at any time, regardless of the
+ * disposition of se, unless the es parameter has SE_RETIRED flag set.
 *
- *	Lock wanted	SE_EXCL_WANTED	p_selock/SE_EWANTED	Action
- * ----------	--------------	-------------------	---------
- *	SE_EXCL		no	dont-care/1	deny lock
- *	SE_EXCL	any(see note)	unlocked/any	grant lock, clear SE_EWANTED
- *	SE_EXCL		yes	any lock/any	deny, set SE_EWANTED
- *	SE_EXCL		no	any lock/any	deny
- *	SE_SHARED	not applicable	shared/0	grant
- *	SE_SHARED	not applicable	unlocked/0	grant
- *	SE_SHARED	not applicable	shared/1	deny
- *	SE_SHARED	not applicable	unlocked/1	deny
- *	SE_SHARED	not applicable	excl/any	deny
+ * [3] If the page is slated for retirement, the lock is denied.
 *
- * Note: the code grants an exclusive lock to the caller and clears
- * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
- * bit's value. This was deemed acceptable as we are not concerned about
- * exclusive-lock starvation. If this ever becomes an issue, a priority or
- * fifo mechanism should also be implemented.
+ * Notes on values of "es":
+ *
+ * es & 1: page_lookup_create will attempt page relocation
+ * es & SE_EXCL_WANTED: caller wants SE_EWANTED set (e.g. delete
+ * memory thread); this prevents reader-starvation of waiting
+ * writer thread(s) by giving priority to writers over readers.
+ * es & SE_RETIRED: caller wants to lock pages even if they are
+ * retired. Default is to deny the lock if the page is retired.
+ *
+ * And yes, we know, the semantics of this function are too complicated.
+ * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
@@ -261,17 +282,14 @@ page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
 	mutex_enter(pse);
 
-	/*
-	 * Current uses of 'es':
-	 * es == 1		page_lookup_create will attempt page relocation
-	 * es == SE_EXCL_WANTED	caller wants SE_EWANTED set (eg. delete
-	 *	memory thread); this prevents reader-starvation of waiting
-	 *	writer thread(s).
-	 */
-
 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
-	    ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));
+	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
+
+	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
+		mutex_exit(pse);
+		VM_STAT_ADD(page_lock_retired);
+		return (0);
+	}
 
 	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
 		se = SE_EXCL;
@@ -312,7 +330,7 @@ page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
 	}
 
 	if (se == SE_EXCL) {
-		if ((es != SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
+		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
 			/*
 			 * if the caller wants a writer lock (but did not
 			 * specify exclusive access), and there is a pending
@@ -327,7 +345,7 @@
 			retval = 1;
 		} else {
 			/* page is locked */
-			if (es == SE_EXCL_WANTED) {
+			if (es & SE_EXCL_WANTED) {
 				/* set the SE_EWANTED bit */
 				pp->p_selock |= SE_EWANTED;
 			}
@@ -336,10 +354,17 @@
 	} else {
 		retval = 0;
 		if (pp->p_selock >= 0) {
-			/* readers are not allowed when excl wanted */
-			if (!(pp->p_selock & SE_EWANTED)) {
-				pp->p_selock += SE_READER;
-				retval = 1;
+			/*
+			 * Readers are not allowed when excl wanted or
+			 * a retire is pending. 
Since kvp pages can take + * a long time to be retired, we make an exception + * for them to avoid hanging threads unnecessarily. + */ + if ((pp->p_selock & SE_EWANTED) == 0) { + if (!PP_PR_REQ(pp) || pp->p_vnode == &kvp) { + pp->p_selock += SE_READER; + retval = 1; + } } } } @@ -468,7 +493,13 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es) old = pp->p_selock; ASSERT(((es & SE_EXCL_WANTED) == 0) || - ((es == SE_EXCL_WANTED) && (se == SE_EXCL))); + ((es & SE_EXCL_WANTED) && (se == SE_EXCL))); + + if (PP_RETIRED(pp) && !(es & SE_RETIRED)) { + mutex_exit(pse); + VM_STAT_ADD(page_trylock_failed); + return (0); + } if (se == SE_SHARED && es == 1 && old == 0) { se = SE_EXCL; @@ -477,11 +508,20 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es) if (se == SE_SHARED) { if (!PP_ISFREE(pp)) { if (old >= 0) { - /* readers are not allowed when excl wanted */ - if (!(old & SE_EWANTED)) { - pp->p_selock = old + SE_READER; - mutex_exit(pse); - return (1); + /* + * Readers are not allowed when excl wanted + * or a retire is pending. Since kvp pages can + * take a long time to be retired, we make an + * exception for them to avoid hanging threads + * unnecessarily. + */ + if ((old & SE_EWANTED) == 0) { + if (!PP_PR_REQ(pp) || + pp->p_vnode == &kvp) { + pp->p_selock = old + SE_READER; + mutex_exit(pse); + return (1); + } } } mutex_exit(pse); @@ -498,7 +538,7 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es) * SE_EWANTED is not set, or if the caller specified * SE_EXCL_WANTED. */ - if (!(old & SE_EWANTED) || (es == SE_EXCL_WANTED)) { + if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) { if ((old & ~SE_EWANTED) == 0) { /* no reader/writer lock held */ THREAD_KPRI_REQUEST(); @@ -508,7 +548,7 @@ page_try_reclaim_lock(page_t *pp, se_t se, int es) return (1); } } - if (es == SE_EXCL_WANTED) { + if (es & SE_EXCL_WANTED) { /* page is locked, set the SE_EWANTED bit */ pp->p_selock |= SE_EWANTED; } @@ -526,9 +566,15 @@ page_trylock(page_t *pp, se_t se) kmutex_t *pse = PAGE_SE_MUTEX(pp); mutex_enter(pse); - if (pp->p_selock & SE_EWANTED) { - /* fail if a thread wants exclusive access */ + if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) || + (se == SE_SHARED && PP_PR_REQ(pp) && pp->p_vnode != &kvp)) { + /* + * Fail if a thread wants exclusive access and page is + * retired, if the page is slated for retirement, or a + * share lock is requested. + */ mutex_exit(pse); + VM_STAT_ADD(page_trylock_failed); return (0); } @@ -551,6 +597,41 @@ page_trylock(page_t *pp, se_t se) } /* + * Variant of page_unlock() specifically for the page freelist + * code. The mere existence of this code is a vile hack that + * has resulted due to the backwards locking order of the page + * freelist manager; please don't call it. + */ +void +page_unlock_noretire(page_t *pp) +{ + kmutex_t *pse = PAGE_SE_MUTEX(pp); + selock_t old; + + mutex_enter(pse); + + old = pp->p_selock; + if ((old & ~SE_EWANTED) == SE_READER) { + pp->p_selock = old & ~SE_READER; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + } else if ((old & ~SE_EWANTED) == SE_DELETED) { + panic("page_unlock_noretire: page %p is deleted", pp); + } else if (old < 0) { + THREAD_KPRI_RELEASE(); + pp->p_selock &= SE_EWANTED; + if (CV_HAS_WAITERS(&pp->p_cv)) + cv_broadcast(&pp->p_cv); + } else if ((old & ~SE_EWANTED) > SE_READER) { + pp->p_selock = old - SE_READER; + } else { + panic("page_unlock_noretire: page %p is not locked", pp); + } + + mutex_exit(pse); +} + +/* * Release the page's "shared/exclusive" lock and wake up anyone * who might be waiting for it. 
 */
@@ -561,6 +642,7 @@ page_unlock(page_t *pp)
 	selock_t old;
 
 	mutex_enter(pse);
+
 	old = pp->p_selock;
 	if ((old & ~SE_EWANTED) == SE_READER) {
 		pp->p_selock = old & ~SE_READER;
@@ -578,7 +660,29 @@
 	} else {
 		panic("page_unlock: page %p is not locked", pp);
 	}
-	mutex_exit(pse);
+
+	if (pp->p_selock == 0 && PP_PR_REQ(pp)) {
+		/*
+		 * Try to retire the page. If it retires, great.
+		 * If not, oh well, we'll get it in the next unlock
+		 * request, and repeat the cycle. Regardless,
+		 * page_tryretire() will drop the page lock.
+		 */
+		if ((pp->p_toxic & PR_BUSY) == 0) {
+			THREAD_KPRI_REQUEST();
+			pp->p_selock = SE_WRITER;
+			page_settoxic(pp, PR_BUSY);
+			mutex_exit(pse);
+			page_tryretire(pp);
+		} else {
+			pp->p_selock = SE_WRITER;
+			page_clrtoxic(pp, PR_BUSY);
+			pp->p_selock = 0;
+			mutex_exit(pse);
+		}
+	} else {
+		mutex_exit(pse);
+	}
 }
 
 /*
diff --git a/usr/src/uts/common/vm/page_retire.c b/usr/src/uts/common/vm/page_retire.c
new file mode 100644
index 0000000000..30b218c15d
--- /dev/null
+++ b/usr/src/uts/common/vm/page_retire.c
@@ -0,0 +1,1473 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Page Retire - Big Theory Statement.
+ *
+ * This file handles removing sections of faulty memory from use when the
+ * user land FMA Diagnosis Engine requests that a page be removed or when
+ * a CE or UE is detected by the hardware.
+ *
+ * In the bad old days, the kernel side of Page Retire did a lot of the work
+ * on its own. Now, with the DE keeping track of errors, the kernel side is
+ * rather simple-minded on most platforms.
+ *
+ * Errors are all reflected to the DE, and after digesting the error and
+ * looking at all previously reported errors, the DE decides what should
+ * be done about the current error. If the DE wants a particular page to
+ * be retired, then the kernel page retire code is invoked via an ioctl.
+ * On non-FMA platforms, the ue_drain and ce_drain paths end up calling
+ * page retire to handle the error. Since page retire is just a simple
+ * mechanism, it doesn't need to differentiate between the different callers.
+ *
+ * The p_toxic field in the page_t is used to indicate which errors have
+ * occurred and what action has been taken on a given page. Because errors are
+ * reported without regard to the locked state of a page, no locks are used
+ * to SET the error bits in p_toxic. However, in order to clear the error
+ * bits, the page_t must be held exclusively locked.
+ *
+ * When page_retire() is called, it must be able to acquire locks, sleep, etc. 
+ * It must not be called from high-level interrupt context. + * + * Depending on how the requested page is being used at the time of the retire + * request (and on the availability of sufficient system resources), the page + * may be retired immediately, or just marked for retirement later. For + * example, locked pages are marked, while free pages are retired. Multiple + * requests may be made to retire the same page, although there is no need + * to: once the p_toxic flags are set, the page will be retired as soon as it + * can be exclusively locked. + * + * The retire mechanism is driven centrally out of page_unlock(). To expedite + * the retirement of pages, further requests for SE_SHARED locks are denied + * as long as a page retirement is pending. In addition, as long as pages are + * pending retirement a background thread runs periodically trying to retire + * those pages. Pages which could not be retired while the system is running + * are scrubbed prior to rebooting to avoid latent errors on the next boot. + * + * Single CE pages and UE pages without persistent errors are scrubbed and + * returned to service. Recidivist pages, as well as FMA-directed requests + * for retirement, result in the page being taken out of service. Once the + * decision is made to take a page out of service, the page is cleared, hashed + * onto the retired_pages vnode, marked as retired, and it is unlocked. No + * other requesters (except for unretire) are allowed to lock retired pages. + * + * The public routines return (sadly) 0 if they worked and a non-zero error + * value if something went wrong. This is done for the ioctl side of the + * world to allow errors to be reflected all the way out to user land. The + * non-zero values are explained in comments atop each function. + */ + +/* + * Things to fix: + * + * 1. Cleanup SE_EWANTED. Since we're aggressive about trying to retire + * pages, we can use page_retire_pp() to replace SE_EWANTED and all + * the special delete_memory_thread() code just goes away. + * + * 2. Trying to retire non-relocatable kvp pages may result in a + * quagmire. This is because seg_kmem() no longer keeps its pages locked, + * and calls page_lookup() in the free path; since kvp pages are modified + * and don't have a usable backing store, page_retire() can't do anything + * with them, and we'll keep denying the lock to seg_kmem_free() in a + * vicious cycle. To prevent that, we don't deny locks to kvp pages, and + * hence only call page_retire_pp() from page_unlock() in the free path. + * Since most kernel pages are indefinitely held anyway, and don't + * participate in I/O, this is of little consequence. + * + * 3. Low memory situations will be interesting. If we don't have + * enough memory for page_relocate() to succeed, we won't be able to + * retire dirty pages; nobody will be able to push them out to disk + * either, since we aggressively deny the page lock. We could change + * fsflush so it can recognize this situation, grab the lock, and push + * the page out, where we'll catch it in the free path and retire it. + * + * 4. Beware of places that have code like this in them: + * + * if (! page_tryupgrade(pp)) { + * page_unlock(pp); + * while (! page_lock(pp, SE_EXCL, NULL, P_RECLAIM)) { + * / *NOTHING* / + * } + * } + * page_free(pp); + * + * The problem is that pp can change identity right after the + * page_unlock() call. In particular, page_retire() can step in + * there, change pp's identity, and hash pp onto the retired_vnode. 
+ * + * Of course, other functions besides page_retire() can have the + * same effect. A kmem reader can waltz by, set up a mapping to the + * page, and then unlock the page. Page_free() will then go castors + * up. So if anybody is doing this, it's already a bug. + * + * 5. mdboot()'s call into page_retire_hunt() should probably be + * moved lower. Where the call is made now, we can get into trouble + * by scrubbing a kernel page that is then accessed later. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mman.h> +#include <sys/vnode.h> +#include <sys/cmn_err.h> +#include <sys/ksynch.h> +#include <sys/thread.h> +#include <sys/disp.h> +#include <sys/ontrap.h> +#include <sys/vmsystm.h> +#include <sys/mem_config.h> +#include <sys/atomic.h> +#include <sys/callb.h> +#include <vm/page.h> +#include <vm/vm_dep.h> +#include <vm/as.h> +#include <vm/hat.h> + +/* + * vnode for all pages which are retired from the VM system; + */ +vnode_t *retired_pages; + +/* + * Background thread that wakes up periodically to try to retire pending + * pages. This prevents threads from becoming blocked indefinitely in + * page_lookup() or some other routine should the page(s) they are waiting + * on become eligible for social security. + */ +static void page_retire_thread(void); +static kthread_t *pr_thread_id; +static kcondvar_t pr_cv; +static kmutex_t pr_thread_mutex; +static clock_t pr_thread_shortwait; +static clock_t pr_thread_longwait; + +/* + * Make a list of all of the pages that have been marked for retirement + * but are not yet retired. At system shutdown, we will scrub all of the + * pages in the list in case there are outstanding UEs. Then, we + * cross-check this list against the number of pages that are yet to be + * retired, and if we find inconsistencies, we scan every page_t in the + * whole system looking for any pages that need to be scrubbed for UEs. + * The background thread also uses this queue to determine which pages + * it should keep trying to retire. 
+ */ +#ifdef DEBUG +#define PR_PENDING_QMAX 32 +#else /* DEBUG */ +#define PR_PENDING_QMAX 256 +#endif /* DEBUG */ +page_t *pr_pending_q[PR_PENDING_QMAX]; +kmutex_t pr_q_mutex; + +/* + * Page retire global kstats + */ +struct page_retire_kstat { + kstat_named_t pr_retired; + kstat_named_t pr_requested; + kstat_named_t pr_requested_free; + kstat_named_t pr_enqueue_fail; + kstat_named_t pr_dequeue_fail; + kstat_named_t pr_pending; + kstat_named_t pr_failed; + kstat_named_t pr_failed_kernel; + kstat_named_t pr_limit; + kstat_named_t pr_limit_exceeded; + kstat_named_t pr_fma; + kstat_named_t pr_mce; + kstat_named_t pr_ue; + kstat_named_t pr_ue_cleared_retire; + kstat_named_t pr_ue_cleared_free; + kstat_named_t pr_ue_persistent; + kstat_named_t pr_unretired; +}; + +static struct page_retire_kstat page_retire_kstat = { + { "pages_retired", KSTAT_DATA_UINT64}, + { "pages_retire_request", KSTAT_DATA_UINT64}, + { "pages_retire_request_free", KSTAT_DATA_UINT64}, + { "pages_notenqueued", KSTAT_DATA_UINT64}, + { "pages_notdequeued", KSTAT_DATA_UINT64}, + { "pages_pending", KSTAT_DATA_UINT64}, + { "pages_deferred", KSTAT_DATA_UINT64}, + { "pages_deferred_kernel", KSTAT_DATA_UINT64}, + { "pages_limit", KSTAT_DATA_UINT64}, + { "pages_limit_exceeded", KSTAT_DATA_UINT64}, + { "pages_fma", KSTAT_DATA_UINT64}, + { "pages_multiple_ce", KSTAT_DATA_UINT64}, + { "pages_ue", KSTAT_DATA_UINT64}, + { "pages_ue_cleared_retired", KSTAT_DATA_UINT64}, + { "pages_ue_cleared_freed", KSTAT_DATA_UINT64}, + { "pages_ue_persistent", KSTAT_DATA_UINT64}, + { "pages_unretired", KSTAT_DATA_UINT64}, +}; + +static kstat_t *page_retire_ksp = NULL; + +#define PR_INCR_KSTAT(stat) \ + atomic_add_64(&(page_retire_kstat.stat.value.ui64), 1) +#define PR_DECR_KSTAT(stat) \ + atomic_add_64(&(page_retire_kstat.stat.value.ui64), -1) + +#define PR_KSTAT_RETIRED_CE (page_retire_kstat.pr_mce.value.ui64) +#define PR_KSTAT_RETIRED_FMA (page_retire_kstat.pr_fma.value.ui64) +#define PR_KSTAT_RETIRED_NOTUE (PR_KSTAT_RETIRED_CE + PR_KSTAT_RETIRED_FMA) +#define PR_KSTAT_PENDING (page_retire_kstat.pr_pending.value.ui64) +#define PR_KSTAT_EQFAIL (page_retire_kstat.pr_enqueue_fail.value.ui64) +#define PR_KSTAT_DQFAIL (page_retire_kstat.pr_dequeue_fail.value.ui64) + +/* + * Limit the number of multiple CE page retires. + * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in + * basis points, where 100 basis points equals one percent. + */ +#define MCE_BPT 10 +uint64_t max_pages_retired_bps = MCE_BPT; +#define PAGE_RETIRE_LIMIT ((physmem * max_pages_retired_bps) / 10000) + +/* + * Control over the verbosity of page retirement. + * + * When set to zero (the default), no messages will be printed. + * When set to one, summary messages will be printed. + * When set > one, all messages will be printed. + * + * A value of one will trigger detailed messages for retirement operations, + * and is intended as a platform tunable for processors where FMA's DE does + * not run (e.g., spitfire). Values > one are intended for debugging only. + */ +int page_retire_messages = 0; + +/* + * Control whether or not we retire dirty UE pages. By default we do + * since we assume the data is corrupt and the process(es) using it will + * be killed. This is platform tunable only, and should probably not be + * changed, ever. + */ +int page_retire_modified = 1; + +/* + * Control whether or not we return scrubbed UE pages to service. + * By default we do not since FMA wants to run its diagnostics first + * and then ask us to unretire the page if it passes. 
Non-FMA platforms + * may set this to zero so we will only retire recidivist pages. It should + * not be changed by the user. + */ +int page_retire_first_ue = 1; + +/* + * Master enable for page retire. This prevents a CE or UE early in boot + * from trying to retire a page before page_retire_init() has finished + * setting things up. This is internal only and is not a tunable! + */ +static int pr_enable = 0; + +extern struct vnode kvp; + +#ifdef DEBUG +struct page_retire_debug { + int prd_dup; + int prd_noaction; + int prd_queued; + int prd_notqueued; + int prd_dequeue; + int prd_top; + int prd_locked; + int prd_reloc; + int prd_modce; + int prd_modue_fail; + int prd_modue_retire; + int prd_kern; + int prd_free; + int prd_noreclaim; + int prd_hashout; + int prd_fma; + int prd_uescrubbed; + int prd_uenotscrubbed; + int prd_mce; + int prd_prlocked; + int prd_prnotlocked; + int prd_prretired; + int prd_ulocked; + int prd_unotretired; + int prd_udestroy; + int prd_uhashout; + int prd_uunretired; + int prd_unotlocked; + int prd_checkhit; + int prd_checkmiss; + int prd_tctop; + int prd_tclocked; + int prd_hunt; + int prd_dohunt; + int prd_earlyhunt; + int prd_latehunt; + int prd_nofreedemote; + int prd_nodemote; + int prd_demoted; +} pr_debug; + +#define PR_DEBUG(foo) ((pr_debug.foo)++) + +/* + * A type histogram. We record the incidence of the various toxic + * flag combinations along with the interesting page attributes. The + * goal is to get as many combinations as we can while driving all + * pr_debug values nonzero (indicating we've exercised all possible + * code paths across all possible page types). Not all combinations + * will make sense -- e.g. PRT_MOD|PRT_KERNEL. + * + * pr_type offset bit encoding (when examining with a debugger): + * + * PRT_NAMED - 0x4 + * PRT_KERNEL - 0x8 + * PRT_FREE - 0x10 + * PRT_MOD - 0x20 + * PRT_FMA - 0x0 + * PRT_MCE - 0x40 + * PRT_UE - 0x80 + */ + +#define PRT_NAMED 0x01 +#define PRT_KERNEL 0x02 +#define PRT_FREE 0x04 +#define PRT_MOD 0x08 +#define PRT_FMA 0x00 /* yes, this is not a mistake */ +#define PRT_MCE 0x10 +#define PRT_UE 0x20 +#define PRT_ALL 0x3F + +int pr_types[PRT_ALL+1]; + +#define PR_TYPES(pp) { \ + int whichtype = 0; \ + if (pp->p_vnode) \ + whichtype |= PRT_NAMED; \ + if (pp->p_vnode == &kvp) \ + whichtype |= PRT_KERNEL; \ + if (PP_ISFREE(pp)) \ + whichtype |= PRT_FREE; \ + if (hat_ismod(pp)) \ + whichtype |= PRT_MOD; \ + if (pp->p_toxic & PR_UE) \ + whichtype |= PRT_UE; \ + if (pp->p_toxic & PR_MCE) \ + whichtype |= PRT_MCE; \ + pr_types[whichtype]++; \ +} + +int recl_calls; +int recl_mtbf = 3; +int reloc_calls; +int reloc_mtbf = 7; +int pr_calls; +int pr_mtbf = 15; + +#define MTBF(v, f) (((++(v)) & (f)) != (f)) + +#else /* DEBUG */ + +#define PR_DEBUG(foo) /* nothing */ +#define PR_TYPES(foo) /* nothing */ +#define MTBF(v, f) (1) + +#endif /* DEBUG */ + +/* + * page_retire_done() - completion processing + * + * Used by the page_retire code for common completion processing. + * It keeps track of how many times a given result has happened, + * and writes out an occasional message. + * + * May be called with a NULL pp (PRD_INVALID_PA case). 
+ */ +#define PRD_INVALID_KEY -1 +#define PRD_SUCCESS 0 +#define PRD_PENDING 1 +#define PRD_FAILED 2 +#define PRD_DUPLICATE 3 +#define PRD_INVALID_PA 4 +#define PRD_LIMIT 5 +#define PRD_UE_SCRUBBED 6 +#define PRD_UNR_SUCCESS 7 +#define PRD_UNR_CANTLOCK 8 +#define PRD_UNR_NOT 9 + +typedef struct page_retire_op { + int pr_key; /* one of the PRD_* defines from above */ + int pr_count; /* How many times this has happened */ + int pr_retval; /* return value */ + int pr_msglvl; /* message level - when to print */ + char *pr_message; /* Cryptic message for field service */ +} page_retire_op_t; + +static page_retire_op_t page_retire_ops[] = { + /* key count retval msglvl message */ + {PRD_SUCCESS, 0, 0, 1, + "Page 0x%08x.%08x removed from service"}, + {PRD_PENDING, 0, EAGAIN, 2, + "Page 0x%08x.%08x will be retired on free"}, + {PRD_FAILED, 0, EAGAIN, 0, NULL}, + {PRD_DUPLICATE, 0, EBUSY, 2, + "Page 0x%08x.%08x already retired"}, + {PRD_INVALID_PA, 0, EINVAL, 2, + "PA 0x%08x.%08x is not a relocatable page"}, + {PRD_LIMIT, 0, 0, 1, + "Page 0x%08x.%08x not retired due to limit exceeded"}, + {PRD_UE_SCRUBBED, 0, 0, 1, + "Previously reported error on page 0x%08x.%08x cleared"}, + {PRD_UNR_SUCCESS, 0, 0, 1, + "Page 0x%08x.%08x returned to service"}, + {PRD_UNR_CANTLOCK, 0, EAGAIN, 2, + "Page 0x%08x.%08x could not be unretired"}, + {PRD_UNR_NOT, 0, EBADF, 2, + "Page 0x%08x.%08x is not retired"}, + {PRD_INVALID_KEY, 0, 0, 0, NULL} /* MUST BE LAST! */ +}; + +/* + * print a message if page_retire_messages is true. + */ +#define PR_MESSAGE(debuglvl, msglvl, msg, pa) \ +{ \ + uint64_t p = (uint64_t)pa; \ + if (page_retire_messages >= msglvl && msg != NULL) { \ + cmn_err(debuglvl, msg, \ + (uint32_t)(p >> 32), (uint32_t)p); \ + } \ +} + +/* + * Note that multiple bits may be set in a single settoxic operation. + * May be called without the page locked. + */ +void +page_settoxic(page_t *pp, uchar_t bits) +{ + atomic_or_8(&pp->p_toxic, bits); +} + +/* + * Note that multiple bits may cleared in a single clrtoxic operation. + * Must be called with the page exclusively locked. + */ +void +page_clrtoxic(page_t *pp, uchar_t bits) +{ + ASSERT(PAGE_EXCL(pp)); + atomic_and_8(&pp->p_toxic, ~bits); +} + +/* + * Prints any page retire messages to the user, and decides what + * error code is appropriate for the condition reported. + */ +static int +page_retire_done(page_t *pp, int code) +{ + page_retire_op_t *prop; + uint64_t pa = 0; + int i; + + if (pp != NULL) { + pa = mmu_ptob(pp->p_pagenum); + } + + prop = NULL; + for (i = 0; page_retire_ops[i].pr_key != PRD_INVALID_KEY; i++) { + if (page_retire_ops[i].pr_key == code) { + prop = &page_retire_ops[i]; + break; + } + } + +#ifdef DEBUG + if (page_retire_ops[i].pr_key == PRD_INVALID_KEY) { + cmn_err(CE_PANIC, "page_retire_done: Invalid opcode %d", code); + } +#endif + + ASSERT(prop->pr_key == code); + + prop->pr_count++; + + PR_MESSAGE(CE_NOTE, prop->pr_msglvl, prop->pr_message, pa); + if (pp != NULL) { + page_settoxic(pp, PR_MSG); + } + + return (prop->pr_retval); +} + +/* + * On a reboot, our friend mdboot() wants to clear up any PP_PR_REQ() pages + * that we were not able to retire. On large machines, walking the complete + * page_t array and looking at every page_t takes too long. So, as a page is + * marked toxic, we track it using a list that can be processed at reboot + * time. 
page_retire_enqueue() will do its best to try to avoid duplicate + * entries, but if we get too many errors at once the queue can overflow, + * in which case we will end up walking every page_t as a last resort. + * The background thread also makes use of this queue to find which pages + * are pending retirement. + */ +static void +page_retire_enqueue(page_t *pp) +{ + int nslot = -1; + int i; + + mutex_enter(&pr_q_mutex); + + /* + * Check to make sure retire hasn't already dequeued it. + * In the meantime if the page was cleaned up, no need + * to enqueue it. + */ + if (PP_RETIRED(pp) || pp->p_toxic == 0) { + mutex_exit(&pr_q_mutex); + PR_DEBUG(prd_noaction); + return; + } + + for (i = 0; i < PR_PENDING_QMAX; i++) { + if (pr_pending_q[i] == pp) { + mutex_exit(&pr_q_mutex); + PR_DEBUG(prd_dup); + return; + } else if (nslot == -1 && pr_pending_q[i] == NULL) { + nslot = i; + } + } + + PR_INCR_KSTAT(pr_pending); + + if (nslot != -1) { + pr_pending_q[nslot] = pp; + PR_DEBUG(prd_queued); + } else { + PR_INCR_KSTAT(pr_enqueue_fail); + PR_DEBUG(prd_notqueued); + } + mutex_exit(&pr_q_mutex); +} + +static void +page_retire_dequeue(page_t *pp) +{ + int i; + + mutex_enter(&pr_q_mutex); + + for (i = 0; i < PR_PENDING_QMAX; i++) { + if (pr_pending_q[i] == pp) { + pr_pending_q[i] = NULL; + break; + } + } + + if (i == PR_PENDING_QMAX) { + PR_INCR_KSTAT(pr_dequeue_fail); + } + + PR_DECR_KSTAT(pr_pending); + PR_DEBUG(prd_dequeue); + + mutex_exit(&pr_q_mutex); +} + +/* + * Act like page_destroy(), but instead of freeing the page, hash it onto + * the retired_pages vnode, and mark it retired. + * + * For fun, we try to scrub the page until it's squeaky clean. + * availrmem is adjusted here. + */ +static void +page_retire_destroy(page_t *pp) +{ + ASSERT(PAGE_EXCL(pp)); + ASSERT(!PP_ISFREE(pp)); + ASSERT(pp->p_szc == 0); + ASSERT(!hat_page_is_mapped(pp)); + ASSERT(!pp->p_vnode); + + page_clr_all_props(pp); + pagescrub(pp, 0, MMU_PAGESIZE); + + pp->p_next = NULL; + pp->p_prev = NULL; + if (page_hashin(pp, retired_pages, (u_offset_t)pp, NULL) == 0) { + cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp); + } + + page_settoxic(pp, PR_RETIRED); + page_clrtoxic(pp, PR_BUSY); + page_retire_dequeue(pp); + PR_INCR_KSTAT(pr_retired); + + if (pp->p_toxic & PR_FMA) { + PR_INCR_KSTAT(pr_fma); + } else if (pp->p_toxic & PR_UE) { + PR_INCR_KSTAT(pr_ue); + } else { + PR_INCR_KSTAT(pr_mce); + } + + mutex_enter(&freemem_lock); + availrmem--; + mutex_exit(&freemem_lock); + + page_unlock(pp); +} + +/* + * Check whether the number of pages which have been retired already exceeds + * the maximum allowable percentage of memory which may be retired. + * + * Returns 1 if the limit has been exceeded. + */ +static int +page_retire_limit(void) +{ + if (PR_KSTAT_RETIRED_NOTUE >= (uint64_t)PAGE_RETIRE_LIMIT) { + PR_INCR_KSTAT(pr_limit_exceeded); + return (1); + } + + return (0); +} + +#define MSG_DM "Data Mismatch occurred at PA 0x%08x.%08x" \ + "[ 0x%x != 0x%x ] while attempting to clear previously " \ + "reported error; page removed from service" + +#define MSG_UE "Uncorrectable Error occurred at PA 0x%08x.%08x while " \ + "attempting to clear previously reported error; page removed " \ + "from service" + +/* + * Attempt to clear a UE from a page. + * Returns 1 if the error has been successfully cleared. 
+ */ +static int +page_clear_transient_ue(page_t *pp) +{ + caddr_t kaddr; + uint8_t rb, wb; + uint64_t pa; + uint32_t pa_hi, pa_lo; + on_trap_data_t otd; + int errors = 0; + int i; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(PP_PR_REQ(pp)); + ASSERT(pp->p_szc == 0); + ASSERT(!hat_page_is_mapped(pp)); + + /* + * Clear the page and attempt to clear the UE. If we trap + * on the next access to the page, we know the UE has recurred. + */ + pagescrub(pp, 0, PAGESIZE); + + /* + * Map the page and write a bunch of bit patterns to compare + * what we wrote with what we read back. This isn't a perfect + * test but it should be good enough to catch most of the + * recurring UEs. If this fails to catch a recurrent UE, we'll + * retire the page the next time we see a UE on the page. + */ + kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1); + + pa = ptob((uint64_t)page_pptonum(pp)); + pa_hi = (uint32_t)(pa >> 32); + pa_lo = (uint32_t)pa; + + /* + * Fill the page with each (0x00 - 0xFF] bit pattern, flushing + * the cache in between reading and writing. We do this under + * on_trap() protection to avoid recursion. + */ + if (on_trap(&otd, OT_DATA_EC)) { + PR_MESSAGE(CE_WARN, 1, MSG_UE, pa); + errors = 1; + } else { + for (wb = 0xff; wb > 0; wb--) { + for (i = 0; i < PAGESIZE; i++) { + kaddr[i] = wb; + } + + sync_data_memory(kaddr, PAGESIZE); + + for (i = 0; i < PAGESIZE; i++) { + rb = kaddr[i]; + if (rb != wb) { + /* + * We had a mismatch without a trap. + * Uh-oh. Something is really wrong + * with this system. + */ + if (page_retire_messages) { + cmn_err(CE_WARN, MSG_DM, + pa_hi, pa_lo, rb, wb); + } + errors = 1; + goto out; /* double break */ + } + } + } + } +out: + no_trap(); + ppmapout(kaddr); + + return (errors ? 0 : 1); +} + +/* + * Try to clear a page_t with a single UE. If the UE was transient, it is + * returned to service, and we return 1. Otherwise we return 0 meaning + * that further processing is required to retire the page. + */ +static int +page_retire_transient_ue(page_t *pp) +{ + ASSERT(PAGE_EXCL(pp)); + ASSERT(!hat_page_is_mapped(pp)); + + /* + * If this page is a repeat offender, retire him under the + * "two strikes and you're out" rule. The caller is responsible + * for scrubbing the page to try to clear the error. + */ + if (pp->p_toxic & PR_UE_SCRUBBED) { + PR_INCR_KSTAT(pr_ue_persistent); + return (0); + } + + if (page_clear_transient_ue(pp)) { + /* + * We set the PR_SCRUBBED_UE bit; if we ever see this + * page again, we will retire it, no questions asked. + */ + page_settoxic(pp, PR_UE_SCRUBBED); + + if (page_retire_first_ue) { + PR_INCR_KSTAT(pr_ue_cleared_retire); + return (0); + } else { + PR_INCR_KSTAT(pr_ue_cleared_free); + + page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG | PR_BUSY); + page_retire_dequeue(pp); + + /* + * Clear the free bit if it's set, since the + * page free code will get cranky if we don't. + */ + PP_CLRFREE(pp); + + /* LINTED: CONSTCOND */ + VN_DISPOSE(pp, B_FREE, 1, kcred); + return (1); + } + } + + PR_INCR_KSTAT(pr_ue_persistent); + return (0); +} + +/* + * Update the statistics dynamically when our kstat is read. 
+ */ +static int +page_retire_kstat_update(kstat_t *ksp, int rw) +{ + struct page_retire_kstat *pr; + + if (ksp == NULL) + return (EINVAL); + + switch (rw) { + + case KSTAT_READ: + pr = (struct page_retire_kstat *)ksp->ks_data; + ASSERT(pr == &page_retire_kstat); + pr->pr_limit.value.ui64 = PAGE_RETIRE_LIMIT; + return (0); + + case KSTAT_WRITE: + return (EACCES); + + default: + return (EINVAL); + } + /*NOTREACHED*/ +} + +/* + * Initialize the page retire mechanism: + * + * - Establish the correctable error retire limit. + * - Initialize locks. + * - Build the retired_pages vnode. + * - Set up the kstats. + * - Fire off the background thread. + * - Tell page_tryretire() it's OK to start retiring pages. + */ +void +page_retire_init(void) +{ + const fs_operation_def_t retired_vnodeops_template[] = {NULL, NULL}; + struct vnodeops *vops; + + const uint_t page_retire_ndata = + sizeof (page_retire_kstat) / sizeof (kstat_named_t); + + ASSERT(page_retire_ksp == NULL); + + if (max_pages_retired_bps <= 0) { + max_pages_retired_bps = MCE_BPT; + } + + mutex_init(&pr_q_mutex, NULL, MUTEX_DEFAULT, NULL); + + retired_pages = vn_alloc(KM_SLEEP); + if (vn_make_ops("retired_pages", retired_vnodeops_template, &vops)) { + cmn_err(CE_PANIC, + "page_retired_init: can't make retired vnodeops"); + } + vn_setops(retired_pages, vops); + + if ((page_retire_ksp = kstat_create("unix", 0, "page_retire", + "misc", KSTAT_TYPE_NAMED, page_retire_ndata, + KSTAT_FLAG_VIRTUAL)) == NULL) { + cmn_err(CE_WARN, "kstat_create for page_retire failed"); + } else { + page_retire_ksp->ks_data = (void *)&page_retire_kstat; + page_retire_ksp->ks_update = page_retire_kstat_update; + kstat_install(page_retire_ksp); + } + + pr_thread_shortwait = 23 * hz; + pr_thread_longwait = 1201 * hz; + mutex_init(&pr_thread_mutex, NULL, MUTEX_DEFAULT, NULL); + cv_init(&pr_cv, NULL, CV_DEFAULT, NULL); + pr_thread_id = thread_create(NULL, 0, page_retire_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); + + pr_enable = 1; +} + +/* + * page_retire_hunt() callback for the retire thread. + */ +static void +page_retire_thread_cb(page_t *pp) +{ + PR_DEBUG(prd_tctop); + if (pp->p_vnode != &kvp && page_trylock(pp, SE_EXCL)) { + PR_DEBUG(prd_tclocked); + page_unlock(pp); + } +} + +/* + * page_retire_hunt() callback for mdboot(). + * + * It is necessary to scrub any failing pages prior to reboot in order to + * prevent a latent error trap from occurring on the next boot. + */ +void +page_retire_mdboot_cb(page_t *pp) +{ + /* + * Don't scrub the kernel, since we might still need it, unless + * we have UEs on the page, in which case we have nothing to lose. + */ + if (pp->p_vnode != &kvp || PP_TOXIC(pp)) { + pp->p_selock = -1; /* pacify ASSERTs */ + pagescrub(pp, 0, PAGESIZE); + pp->p_selock = 0; + } + pp->p_toxic = 0; +} + +/* + * Hunt down any pages in the system that have not yet been retired, invoking + * the provided callback function on each of them. 
+ */ +void +page_retire_hunt(void (*callback)(page_t *)) +{ + page_t *pp; + page_t *first; + int i, found; + + PR_DEBUG(prd_hunt); + + if (PR_KSTAT_PENDING == 0) { + return; + } + + PR_DEBUG(prd_dohunt); + + found = 0; + mutex_enter(&pr_q_mutex); + + for (i = 0; i < PR_PENDING_QMAX; i++) { + if ((pp = pr_pending_q[i]) != NULL) { + mutex_exit(&pr_q_mutex); + callback(pp); + mutex_enter(&pr_q_mutex); + found++; + } + } + + if (PR_KSTAT_EQFAIL == PR_KSTAT_DQFAIL && found == PR_KSTAT_PENDING) { + mutex_exit(&pr_q_mutex); + PR_DEBUG(prd_earlyhunt); + return; + } + mutex_exit(&pr_q_mutex); + + PR_DEBUG(prd_latehunt); + + /* + * We've lost track of a page somewhere. Hunt it down. + */ + memsegs_lock(0); + pp = first = page_first(); + do { + if (PP_PR_REQ(pp)) { + callback(pp); + if (++found == PR_KSTAT_PENDING) { + break; /* got 'em all */ + } + } + } while ((pp = page_next(pp)) != first); + memsegs_unlock(0); +} + +/* + * The page_retire_thread loops forever, looking to see if there are + * pages still waiting to be retired. + */ +static void +page_retire_thread(void) +{ + callb_cpr_t c; + + CALLB_CPR_INIT(&c, &pr_thread_mutex, callb_generic_cpr, "page_retire"); + + mutex_enter(&pr_thread_mutex); + for (;;) { + if (pr_enable && PR_KSTAT_PENDING) { + kmem_reap(); + seg_preap(); + page_retire_hunt(page_retire_thread_cb); + CALLB_CPR_SAFE_BEGIN(&c); + (void) cv_timedwait(&pr_cv, &pr_thread_mutex, + lbolt + pr_thread_shortwait); + CALLB_CPR_SAFE_END(&c, &pr_thread_mutex); + } else { + CALLB_CPR_SAFE_BEGIN(&c); + (void) cv_timedwait(&pr_cv, &pr_thread_mutex, + lbolt + pr_thread_longwait); + CALLB_CPR_SAFE_END(&c, &pr_thread_mutex); + } + } + /*NOTREACHED*/ +} + +/* + * page_retire_pp() decides what to do with a failing page. + * + * When we get a free page (e.g. the scrubber or in the free path) life is + * nice because the page is clean and marked free -- those always retire + * nicely. From there we go by order of difficulty. If the page has data, + * we attempt to relocate its contents to a suitable replacement page. If + * that does not succeed, we look to see if it is clean. If after all of + * this we have a clean, unmapped page (which we usually do!), we retire it. + * If the page is not clean, we still process it regardless on a UE; for + * CEs or FMA requests, we fail leaving the page in service. The page will + * eventually be tried again later. We always return with the page unlocked + * since we are called from page_unlock(). + * + * We don't call panic or do anything fancy down in here. Our boss the DE + * gets paid handsomely to do his job of figuring out what to do when errors + * occur. We just do what he tells us to do. 
+ */ +static int +page_retire_pp(page_t *pp) +{ + int toxic; + + ASSERT(PAGE_EXCL(pp)); + ASSERT(pp->p_iolock_state == 0); + ASSERT(pp->p_szc == 0); + + PR_DEBUG(prd_top); + PR_TYPES(pp); + + toxic = pp->p_toxic; + ASSERT(toxic & PR_REASONS); + + if ((toxic & (PR_FMA | PR_MCE)) && !(toxic & PR_UE) && + page_retire_limit()) { + page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY); + page_retire_dequeue(pp); + page_unlock(pp); + return (page_retire_done(pp, PRD_LIMIT)); + } + + if (PP_ISFREE(pp)) { + PR_DEBUG(prd_free); + if (!MTBF(recl_calls, recl_mtbf) || !page_reclaim(pp, NULL)) { + PR_DEBUG(prd_noreclaim); + PR_INCR_KSTAT(pr_failed); + page_unlock(pp); + return (page_retire_done(pp, PRD_FAILED)); + } + } + + if ((toxic & PR_UE) == 0 && pp->p_vnode && !PP_ISFREE(pp) && + !PP_ISNORELOC(pp) && MTBF(reloc_calls, reloc_mtbf)) { + page_t *newpp; + spgcnt_t count; + + /* + * If we can relocate the page, great! newpp will go + * on without us, and everything is fine. Regardless + * of whether the relocation succeeds, we are still + * going to take `pp' around back and shoot it. + */ + PR_DEBUG(prd_reloc); + newpp = NULL; + if (page_relocate(&pp, &newpp, 0, 0, &count, NULL) == 0) { + page_unlock(newpp); + ASSERT(hat_page_getattr(pp, P_MOD) == 0); + } + } + + if (pp->p_vnode == &kvp) { + PR_DEBUG(prd_kern); + PR_INCR_KSTAT(pr_failed_kernel); + page_unlock(pp); + return (page_retire_done(pp, PRD_FAILED)); + } + + if (pp->p_lckcnt || pp->p_cowcnt) { + if (toxic & PR_UE) { + (void) page_clear_lck_cow(pp, 1); + } else { + PR_DEBUG(prd_locked); + PR_INCR_KSTAT(pr_failed); + page_unlock(pp); + return (page_retire_done(pp, PRD_FAILED)); + } + } + + (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); + ASSERT(!PP_ISFREE(pp)); + ASSERT(!hat_page_is_mapped(pp)); + + /* + * If the page is modified, was not relocated, and not toxic, + * we can't retire it without dropping data on the floor. + * + * RFE: we could change fsflush so that it (and only it) will + * be allowed to lock this page and push it out. Once it cleans + * the page, we'd then be able to retire it on the free path. + * In practice, this should be exceedingly rare. + */ + if (hat_ismod(pp)) { + if ((toxic & PR_UE) == 0) { + PR_DEBUG(prd_modce); + PR_INCR_KSTAT(pr_failed); + page_unlock(pp); + return (page_retire_done(pp, PRD_FAILED)); + } else if (page_retire_modified == 0) { + PR_DEBUG(prd_modue_fail); + PR_INCR_KSTAT(pr_failed); + page_unlock(pp); + return (page_retire_done(pp, PRD_FAILED)); + } + PR_DEBUG(prd_modue_retire); + } + + if (pp->p_vnode) { + PR_DEBUG(prd_hashout); + page_hashout(pp, NULL); + } + ASSERT(!pp->p_vnode); + + /* + * The problem page is locked, demoted, unmapped, not free, + * hashed out, and not COW or mlocked (whew!). + * + * Now we select our ammunition, take it around back, and shoot it. + */ + if (toxic & PR_UE) { + if (hat_ismod(pp)) { + /* + * Let the user know we are dropping their data + * on the floor. 
+			 */
+			PR_MESSAGE(CE_WARN, 1, "Removing modified page "
+			    "0x%08x.%08x from service",
+			    mmu_ptob(pp->p_pagenum));
+		}
+		if (page_retire_transient_ue(pp)) {
+			PR_DEBUG(prd_uescrubbed);
+			return (page_retire_done(pp, PRD_UE_SCRUBBED));
+		} else {
+			PR_DEBUG(prd_uenotscrubbed);
+			page_retire_destroy(pp);
+			return (page_retire_done(pp, PRD_SUCCESS));
+		}
+	} else if (toxic & PR_FMA) {
+		PR_DEBUG(prd_fma);
+		page_retire_destroy(pp);
+		return (page_retire_done(pp, PRD_SUCCESS));
+	} else if (toxic & PR_MCE) {
+		PR_DEBUG(prd_mce);
+		page_retire_destroy(pp);
+		return (page_retire_done(pp, PRD_SUCCESS));
+	}
+	panic("page_retire_pp: bad toxic flags %d", toxic);
+	/*NOTREACHED*/
+}
+
+/*
+ * Try to retire a page when we stumble onto it in the page lock routines.
+ */
+void
+page_tryretire(page_t *pp)
+{
+	ASSERT(PAGE_EXCL(pp));
+
+	if (!pr_enable) {
+		page_unlock(pp);
+		return;
+	}
+
+	/*
+	 * If the page is a big page, try to break it up.
+	 *
+	 * If there are other bad pages besides `pp', they will be
+	 * recursively retired for us thanks to a bit of magic.
+	 * If the page is a small page with errors, try to retire it.
+	 */
+	if (pp->p_szc > 0) {
+		if (PP_ISFREE(pp) && !page_try_demote_free_pages(pp)) {
+			page_unlock(pp);
+			PR_DEBUG(prd_nofreedemote);
+			return;
+		} else if (!page_try_demote_pages(pp)) {
+			page_unlock(pp);
+			PR_DEBUG(prd_nodemote);
+			return;
+		}
+		PR_DEBUG(prd_demoted);
+		page_unlock(pp);
+	} else {
+		(void) page_retire_pp(pp);
+	}
+}
+
+/*
+ * page_retire() - the front door for retiring a page.
+ *
+ * Ideally, page_retire() would instantly retire the requested page.
+ * Unfortunately, some pages are locked or otherwise tied up and cannot be
+ * retired right away. To deal with that, bits are set in p_toxic of the
+ * page_t. An attempt is made to lock the page; if the attempt is
+ * successful, we instantly unlock the page, counting on page_unlock() to
+ * notice p_toxic is nonzero and to call back into page_retire_pp().
+ * Success is determined by looking to see whether the page has been
+ * retired once it has been unlocked.
+ *
+ * Returns:
+ *
+ *   - 0 on success,
+ *   - EINVAL when the PA is whacko,
+ *   - EBUSY if the page is already retired, or
+ *   - EAGAIN if the page could not be _immediately_ retired.
+ */
+int
+page_retire(uint64_t pa, uchar_t reason)
+{
+	page_t	*pp;
+
+	ASSERT(reason & PR_REASONS);		/* there must be a reason */
+	ASSERT(!(reason & ~PR_REASONS));	/* but no other bits */
+
+	pp = page_numtopp_nolock(mmu_btop(pa));
+	if (pp == NULL) {
+		PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
+		    " page 0x%08x.%08x; page is not relocatable memory", pa);
+		return (page_retire_done(pp, PRD_INVALID_PA));
+	}
+	if (PP_RETIRED(pp)) {
+		return (page_retire_done(pp, PRD_DUPLICATE));
+	}
+
+	if (reason & PR_UE) {
+		PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
+		    " page 0x%08x.%08x", pa);
+	} else {
+		PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
+		    " page 0x%08x.%08x", pa);
+	}
+	page_settoxic(pp, reason);
+	page_retire_enqueue(pp);
+
+	/*
+	 * And now for some magic.
+	 *
+	 * We marked this page toxic up above. All there is left to do is
+	 * to try to lock the page and then unlock it. The page lock routines
+	 * will intercept the page and retire it if they can. If the page
+	 * cannot be locked, that's okay -- page_unlock() or the background
+	 * thread will eventually get it; until then, the lock routines will
+	 * deny further locks on it.
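+	 *
+	 * (The MTBF() test below is a fault-injection hook used by the
+	 * page retirement test code; on a non-DEBUG kernel it always
+	 * passes.)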
+ */ + if (MTBF(pr_calls, pr_mtbf) && page_trylock(pp, SE_EXCL)) { + PR_DEBUG(prd_prlocked); + page_unlock(pp); + } else { + PR_DEBUG(prd_prnotlocked); + } + + if (PP_RETIRED(pp)) { + PR_DEBUG(prd_prretired); + return (0); + } else { + cv_signal(&pr_cv); + PR_INCR_KSTAT(pr_failed); + + if (pp->p_toxic & PR_MSG) { + return (page_retire_done(pp, PRD_FAILED)); + } else { + return (page_retire_done(pp, PRD_PENDING)); + } + } +} + +/* + * Take a retired page off the retired-pages vnode and clear the toxic flags. + * If "free" is nonzero, lock it and put it back on the freelist. If "free" + * is zero, the caller already holds SE_EXCL lock so we simply unretire it + * and don't do anything else with it. + * + * Any unretire messages are printed from this routine. + * + * Returns 0 if page pp was unretired; else an error code. + */ +int +page_unretire_pp(page_t *pp, int free) +{ + /* + * To be retired, a page has to be hashed onto the retired_pages vnode + * and have PR_RETIRED set in p_toxic. + */ + if (free == 0 || page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) { + ASSERT(PAGE_EXCL(pp)); + PR_DEBUG(prd_ulocked); + if (!PP_RETIRED(pp)) { + PR_DEBUG(prd_unotretired); + page_unlock(pp); + return (page_retire_done(pp, PRD_UNR_NOT)); + } + + PR_MESSAGE(CE_NOTE, 1, "unretiring retired" + " page 0x%08x.%08x", mmu_ptob(pp->p_pagenum)); + if (pp->p_toxic & PR_FMA) { + PR_DECR_KSTAT(pr_fma); + } else if (pp->p_toxic & PR_UE) { + PR_DECR_KSTAT(pr_ue); + } else { + PR_DECR_KSTAT(pr_mce); + } + page_clrtoxic(pp, PR_ALLFLAGS); + + if (free) { + PR_DEBUG(prd_udestroy); + page_destroy(pp, 0); + } else { + PR_DEBUG(prd_uhashout); + page_hashout(pp, NULL); + } + + mutex_enter(&freemem_lock); + availrmem++; + mutex_exit(&freemem_lock); + + PR_DEBUG(prd_uunretired); + PR_DECR_KSTAT(pr_retired); + PR_INCR_KSTAT(pr_unretired); + return (page_retire_done(pp, PRD_UNR_SUCCESS)); + } + PR_DEBUG(prd_unotlocked); + return (page_retire_done(pp, PRD_UNR_CANTLOCK)); +} + +/* + * Return a page to service by moving it from the retired_pages vnode + * onto the freelist. + * + * Called from mmioctl_page_retire() on behalf of the FMA DE. + * + * Returns: + * + * - 0 if the page is unretired, + * - EAGAIN if the pp can not be locked, + * - EINVAL if the PA is whacko, and + * - EBADF if the pp is not retired. + */ +int +page_unretire(uint64_t pa) +{ + page_t *pp; + + pp = page_numtopp_nolock(mmu_btop(pa)); + if (pp == NULL) { + return (page_retire_done(pp, PRD_INVALID_PA)); + } + + return (page_unretire_pp(pp, 1)); +} + +/* + * Test a page to see if it is retired. If errors is non-NULL, the toxic + * bits of the page are returned. Returns 0 on success, error code on failure. + */ +int +page_retire_check_pp(page_t *pp, uint64_t *errors) +{ + int rc; + + if (PP_RETIRED(pp)) { + PR_DEBUG(prd_checkhit); + rc = 0; + } else { + PR_DEBUG(prd_checkmiss); + rc = EAGAIN; + } + + /* + * We have magically arranged the bit values returned to fmd(1M) + * to line up with the FMA, MCE, and UE bits of the page_t. + */ + if (errors) { + uint64_t toxic = (uint64_t)(pp->p_toxic & PR_ERRMASK); + if (toxic & PR_UE_SCRUBBED) { + toxic &= ~PR_UE_SCRUBBED; + toxic |= PR_UE; + } + *errors = toxic; + } + + return (rc); +} + +/* + * Test to see if the page_t for a given PA is retired, and return the + * hardware errors we have seen on the page if requested. + * + * Called from mmioctl_page_retire on behalf of the FMA DE. + * + * Returns: + * + * - 0 if the page is retired, + * - EAGAIN if it is not, and + * - EINVAL if the PA is whacko. 
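+ *
+ * When errors is non-NULL it is initialized even on failure, so the
+ * caller always sees a well-defined value. A hypothetical caller
+ * (sketch only, not part of this interface) might look like:
+ *
+ *	uint64_t flags;
+ *	if (page_retire_check(pa, &flags) == 0) {
+ *		... the page is retired; flags holds the PR_UE,
+ *		... PR_MCE and PR_FMA bits recorded for the page
+ *	}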
+ */ +int +page_retire_check(uint64_t pa, uint64_t *errors) +{ + page_t *pp; + + if (errors) { + *errors = 0; + } + + pp = page_numtopp_nolock(mmu_btop(pa)); + if (pp == NULL) { + return (page_retire_done(pp, PRD_INVALID_PA)); + } + + return (page_retire_check_pp(pp, errors)); +} + +/* + * Page retire self-test. For now, it always returns 0. + */ +int +page_retire_test(void) +{ + page_t *first, *pp, *cpp, *cpp2, *lpp; + + /* + * Tests the corner case where a large page can't be retired + * because one of the constituent pages is locked. We mark + * one page to be retired and try to retire it, and mark the + * other page to be retired but don't try to retire it, so + * that page_unlock() in the failure path will recurse and try + * to retire THAT page. This is the worst possible situation + * we can get ourselves into. + */ + memsegs_lock(0); + pp = first = page_first(); + do { + if (pp->p_szc && PP_PAGEROOT(pp) == pp) { + cpp = pp + 1; + lpp = PP_ISFREE(pp)? pp : pp + 2; + cpp2 = pp + 3; + if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED)) + continue; + if (!page_trylock(cpp, SE_EXCL)) { + page_unlock(lpp); + continue; + } + page_settoxic(cpp, PR_FMA | PR_BUSY); + page_settoxic(cpp2, PR_FMA); + page_tryretire(cpp); /* will fail */ + page_unlock(lpp); + (void) page_retire(cpp->p_pagenum, PR_FMA); + (void) page_retire(cpp2->p_pagenum, PR_FMA); + } + } while ((pp = page_next(pp)) != first); + memsegs_unlock(0); + + return (0); +} diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c index 5b3db34db1..27b2702d28 100644 --- a/usr/src/uts/common/vm/vm_page.c +++ b/usr/src/uts/common/vm/vm_page.c @@ -87,90 +87,6 @@ static pgcnt_t max_page_get; /* max page_get request size in pages */ pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */ /* - * vnode for all pages which are retired from the VM system; - * such as pages with Uncorrectable Errors. - */ -struct vnode retired_ppages; - -static void page_retired_init(void); -static void retired_dispose(vnode_t *vp, page_t *pp, int flag, - int dn, cred_t *cr); -static void retired_inactive(vnode_t *vp, cred_t *cr); -static void page_retired(page_t *pp); -static void retired_page_removed(page_t *pp); -void page_unretire_pages(void); - -/* - * The maximum number of pages that will be unretired in one iteration. - * This number is totally arbitrary. - */ -#define UNRETIRE_PAGES 256 - -/* - * We limit the number of pages that may be retired to - * a percentage of the total physical memory. Note that - * the percentage values are stored as 'basis points', - * ie, 100 basis points is 1%. - */ -#define MAX_PAGES_RETIRED_BPS_DEFAULT 10 /* .1% */ - -uint64_t max_pages_retired_bps = MAX_PAGES_RETIRED_BPS_DEFAULT; - -static int pages_retired_limit_exceeded(void); - -/* - * operations vector for vnode with retired pages. Only VOP_DISPOSE - * and VOP_INACTIVE are intercepted. 
- */ -struct vnodeops retired_vnodeops = { - "retired_vnodeops", - fs_nosys, /* open */ - fs_nosys, /* close */ - fs_nosys, /* read */ - fs_nosys, /* write */ - fs_nosys, /* ioctl */ - fs_nosys, /* setfl */ - fs_nosys, /* getattr */ - fs_nosys, /* setattr */ - fs_nosys, /* access */ - fs_nosys, /* lookup */ - fs_nosys, /* create */ - fs_nosys, /* remove */ - fs_nosys, /* link */ - fs_nosys, /* rename */ - fs_nosys, /* mkdir */ - fs_nosys, /* rmdir */ - fs_nosys, /* readdir */ - fs_nosys, /* symlink */ - fs_nosys, /* readlink */ - fs_nosys, /* fsync */ - retired_inactive, - fs_nosys, /* fid */ - fs_rwlock, /* rwlock */ - fs_rwunlock, /* rwunlock */ - fs_nosys, /* seek */ - fs_nosys, /* cmp */ - fs_nosys, /* frlock */ - fs_nosys, /* space */ - fs_nosys, /* realvp */ - fs_nosys, /* getpage */ - fs_nosys, /* putpage */ - fs_nosys_map, - fs_nosys_addmap, - fs_nosys, /* delmap */ - fs_nosys_poll, - fs_nosys, /* dump */ - fs_nosys, /* l_pathconf */ - fs_nosys, /* pageio */ - fs_nosys, /* dumpctl */ - retired_dispose, - fs_nosys, /* setsecattr */ - fs_nosys, /* getsecatt */ - fs_nosys, /* shrlock */ - fs_vnevent_nosupport /* vnevent */ -}; - -/* * freemem_lock protects all freemem variables: * availrmem. Also this lock protects the globals which track the * availrmem changes for accurate kernel footprint calculation. @@ -289,15 +205,6 @@ static kcondvar_t pcgs_cv; /* cv for delay in pcgs */ #define PAGE_LOCK_MAXIMUM \ ((1 << (sizeof (((page_t *)0)->p_lckcnt) * NBBY)) - 1) -/* - * Control over the verbosity of page retirement. When set to zero, no messages - * will be printed. A value of one will trigger messages for retirement - * operations, and is intended for processors which don't yet support FMA - * (spitfire). Two will cause verbose messages to be printed when retirements - * complete, and is intended only for debugging purposes. - */ -int page_retire_messages = 0; - #ifdef VM_STATS /* @@ -440,11 +347,7 @@ vm_init(void) (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm"); page_init_mem_config(); - - /* - * initialise the vnode for retired pages - */ - page_retired_init(); + page_retire_init(); } /* @@ -2799,153 +2702,6 @@ page_free(page_t *pp, int dontneed) ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); - if (page_deteriorating(pp)) { - volatile int i = 0; - char *kaddr; - volatile int rb, wb; - uint64_t pa; - volatile int ue = 0; - on_trap_data_t otd; - - if (pp->p_vnode != NULL) { - /* - * Let page_destroy() do its bean counting and - * hash out the page; it will then call back - * into page_free() with pp->p_vnode == NULL. - */ - page_destroy(pp, 0); - return; - } - - if (page_isfailing(pp)) { - /* - * If we have already exceeded the limit for - * pages retired, we will treat this page as - * 'toxic' rather than failing. That will ensure - * that the page is at least cleaned, and if - * a UE is detected, the page will be retired - * anyway. - */ - if (pages_retired_limit_exceeded()) { - /* - * clear the flag and reset to toxic - */ - page_clrtoxic(pp); - page_settoxic(pp, PAGE_IS_TOXIC); - } else { - pa = ptob((uint64_t)page_pptonum(pp)); - if (page_retire_messages) { - cmn_err(CE_NOTE, "Page 0x%08x.%08x " - "removed from service", - (uint32_t)(pa >> 32), (uint32_t)pa); - } - goto page_failed; - } - } - - pagescrub(pp, 0, PAGESIZE); - - /* - * We want to determine whether the error that occurred on - * this page is transient or persistent, so we get a mapping - * to the page and try every possible bit pattern to compare - * what we write with what we read back. 
A smaller number - * of bit patterns might suffice, but there's no point in - * getting fancy. If this is the hot path on your system, - * you've got bigger problems. - */ - kaddr = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1); - for (wb = 0xff; wb >= 0; wb--) { - if (on_trap(&otd, OT_DATA_EC)) { - pa = ptob((uint64_t)page_pptonum(pp)) + i; - page_settoxic(pp, PAGE_IS_FAILING); - - if (page_retire_messages) { - cmn_err(CE_WARN, "Uncorrectable Error " - "occurred at PA 0x%08x.%08x while " - "attempting to clear previously " - "reported error; page removed from " - "service", (uint32_t)(pa >> 32), - (uint32_t)pa); - } - - ue++; - break; - } - - /* - * Write out the bit pattern, flush it to memory, and - * read it back while under on_trap() protection. - */ - for (i = 0; i < PAGESIZE; i++) - kaddr[i] = wb; - - sync_data_memory(kaddr, PAGESIZE); - - for (i = 0; i < PAGESIZE; i++) { - if ((rb = (uchar_t)kaddr[i]) != wb) { - page_settoxic(pp, PAGE_IS_FAILING); - goto out; - } - } - } -out: - no_trap(); - ppmapout(kaddr); - - if (wb >= 0 && !ue) { - pa = ptob((uint64_t)page_pptonum(pp)) + i; - if (page_retire_messages) { - cmn_err(CE_WARN, "Data Mismatch occurred at PA " - "0x%08x.%08x [ 0x%x != 0x%x ] while " - "attempting to clear previously reported " - "error; page removed from service", - (uint32_t)(pa >> 32), (uint32_t)pa, rb, wb); - } - } -page_failed: - /* - * DR operations change the association between a page_t - * and the physical page it represents. Check if the - * page is still bad. If it is, then retire it. - */ - if (page_isfaulty(pp) && page_isfailing(pp)) { - /* - * In the future, it might be useful to have a platform - * callback here to tell the hardware to fence off this - * page during the next reboot. - * - * We move the page to the retired_vnode here - */ - (void) page_hashin(pp, &retired_ppages, - (u_offset_t)ptob((uint64_t)page_pptonum(pp)), NULL); - mutex_enter(&freemem_lock); - availrmem--; - mutex_exit(&freemem_lock); - page_retired(pp); - page_downgrade(pp); - - /* - * If DR raced with the above page retirement code, - * we might have retired a good page. If so, unretire - * the page. - */ - if (!page_isfaulty(pp)) - page_unretire_pages(); - return; - } - - pa = ptob((uint64_t)page_pptonum(pp)); - - if (page_retire_messages) { - cmn_err(CE_NOTE, "Previously reported error on page " - "0x%08x.%08x cleared", (uint32_t)(pa >> 32), - (uint32_t)pa); - } - - page_clrtoxic(pp); - } - if (PP_ISFREE(pp)) { panic("page_free: page %p is free", (void *)pp); } @@ -3089,7 +2845,6 @@ page_free_pages(page_t *pp) pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); pgcnt_t i; uint_t szc = pp->p_szc; - int toxic = 0; VM_STAT_ADD(pagecnt.pc_free_pages); TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, @@ -3118,9 +2873,6 @@ page_free_pages(page_t *pp) ASSERT(tpp->p_vnode == NULL); ASSERT(tpp->p_szc == szc); - if (page_deteriorating(tpp)) - toxic = 1; - PP_SETFREE(tpp); page_clr_all_props(tpp); PP_SETAGED(tpp); @@ -3131,10 +2883,6 @@ page_free_pages(page_t *pp) } ASSERT(rootpp == pp); - if (toxic) { - page_free_toxic_pages(rootpp); - return; - } page_list_add_pages(rootpp, 0); page_create_putback(pgcnt); } @@ -3219,12 +2967,13 @@ page_reclaim(page_t *pp, kmutex_t *lock) struct pcf *p; uint_t pcf_index; struct cpu *cpup; - int enough; uint_t i; + pgcnt_t npgs, need, collected; ASSERT(lock != NULL ? 
MUTEX_HELD(lock) : 1); ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp)); - ASSERT(pp->p_szc == 0); + + npgs = page_get_pagecnt(pp->p_szc); /* * If `freemem' is 0, we cannot reclaim this page from the @@ -3254,18 +3003,19 @@ page_reclaim(page_t *pp, kmutex_t *lock) goto page_reclaim_nomem; } - enough = 0; + collected = 0; pcf_index = PCF_INDEX(); p = &pcf[pcf_index]; p->pcf_touch = 1; mutex_enter(&p->pcf_lock); - if (p->pcf_count >= 1) { - enough = 1; - p->pcf_count--; + if (p->pcf_count >= npgs) { + collected = npgs; + p->pcf_count -= npgs; } mutex_exit(&p->pcf_lock); + need = npgs - collected; - if (!enough) { + if (need > 0) { VM_STAT_ADD(page_reclaim_zero); /* * Check again. Its possible that some other thread @@ -3277,15 +3027,22 @@ page_reclaim(page_t *pp, kmutex_t *lock) for (i = 0; i < PCF_FANOUT; i++) { p->pcf_touch = 1; mutex_enter(&p->pcf_lock); - if (p->pcf_count >= 1) { - p->pcf_count -= 1; - enough = 1; - break; + if (p->pcf_count) { + if (p->pcf_count >= need) { + p->pcf_count -= need; + collected += need; + need = 0; + break; + } else if (p->pcf_count) { + collected += p->pcf_count; + need -= p->pcf_count; + p->pcf_count = 0; + } } p++; } - if (!enough) { + if (need > 0) { page_reclaim_nomem: /* * We really can't have page `pp'. @@ -3309,6 +3066,7 @@ page_reclaim_nomem: mutex_enter(&new_freemem_lock); p = pcf; + p->pcf_count += collected; for (i = 0; i < PCF_FANOUT; i++) { p->pcf_wait++; mutex_exit(&p->pcf_lock); @@ -3328,11 +3086,13 @@ page_reclaim_nomem: } /* - * There was a page to be found. + * We beat the PCF bins over the head until + * we got the memory that we wanted. * The pcf accounting has been done, * though none of the pcf_wait flags have been set, * drop the locks and continue on. */ + ASSERT(collected == npgs); while (p >= pcf) { mutex_exit(&p->pcf_lock); p--; @@ -3343,14 +3103,19 @@ page_reclaim_nomem: * freemem is not protected by any lock. Thus, we cannot * have any assertion containing freemem here. */ - freemem -= 1; + freemem -= npgs; VM_STAT_ADD(pagecnt.pc_reclaim); if (PP_ISAGED(pp)) { - page_list_sub(pp, PG_FREE_LIST); + if (npgs > 1) { + page_list_sub_pages(pp, pp->p_szc); + } else { + page_list_sub(pp, PG_FREE_LIST); + } TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE, "page_reclaim_free:pp %p", pp); } else { + ASSERT(npgs == 1); page_list_sub(pp, PG_CACHE_LIST); TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE, "page_reclaim_cache:pp %p", pp); @@ -3363,9 +3128,11 @@ page_reclaim_nomem: * * Set the reference bit to protect against immediate pageout. 
*/ - PP_CLRFREE(pp); - PP_CLRAGED(pp); - page_set_props(pp, P_REF); + for (i = 0; i < npgs; i++, pp = page_next(pp)) { + PP_CLRFREE(pp); + PP_CLRAGED(pp); + page_set_props(pp, P_REF); + } CPU_STATS_ENTER_K(); cpup = CPU; /* get cpup now that CPU cannot change */ @@ -3441,7 +3208,6 @@ page_destroy_pages(page_t *pp) pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); pgcnt_t i, pglcks = 0; uint_t szc = pp->p_szc; - int toxic = 0; ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); @@ -3471,9 +3237,6 @@ page_destroy_pages(page_t *pp) ASSERT(tpp->p_vnode == NULL); ASSERT(tpp->p_szc == szc); - if (page_deteriorating(tpp)) - toxic = 1; - PP_SETFREE(tpp); page_clr_all_props(tpp); PP_SETAGED(tpp); @@ -3489,10 +3252,6 @@ page_destroy_pages(page_t *pp) mutex_exit(&freemem_lock); } - if (toxic) { - page_free_toxic_pages(rootpp); - return; - } page_list_add_pages(rootpp, 0); page_create_putback(pgcnt); } @@ -3916,14 +3675,6 @@ page_hashout(page_t *pp, kmutex_t *phm) mutex_exit(nphm); /* - * If the page was retired, update the pages_retired - * total and clear the page flag - */ - if (page_isretired(pp)) { - retired_page_removed(pp); - } - - /* * Wake up processes waiting for this page. The page's * identity has been changed, and is probably not the * desired page any longer. @@ -5397,6 +5148,63 @@ page_release(page_t *pp, int checkmod) return (status); } +/* + * Given a constituent page, try to demote the large page on the freelist. + * + * Returns nonzero if the page could be demoted successfully. Returns with + * the constituent page still locked. + */ +int +page_try_demote_free_pages(page_t *pp) +{ + page_t *rootpp = pp; + pfn_t pfn = page_pptonum(pp); + spgcnt_t npgs; + uint_t szc = pp->p_szc; + + ASSERT(PP_ISFREE(pp)); + ASSERT(PAGE_EXCL(pp)); + + /* + * Adjust rootpp and lock it, if `pp' is not the base + * constituent page. + */ + npgs = page_get_pagecnt(pp->p_szc); + if (npgs == 1) { + return (0); + } + + if (!IS_P2ALIGNED(pfn, npgs)) { + pfn = P2ALIGN(pfn, npgs); + rootpp = page_numtopp_nolock(pfn); + } + + if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) { + return (0); + } + + if (rootpp->p_szc != szc) { + if (pp != rootpp) + page_unlock(rootpp); + return (0); + } + + page_demote_free_pages(rootpp); + + if (pp != rootpp) + page_unlock(rootpp); + + ASSERT(PP_ISFREE(pp)); + ASSERT(PAGE_EXCL(pp)); + return (1); +} + +/* + * Given a constituent page, try to demote the large page. + * + * Returns nonzero if the page could be demoted successfully. Returns with + * the constituent page still locked. + */ int page_try_demote_pages(page_t *pp) { @@ -5406,27 +5214,27 @@ page_try_demote_pages(page_t *pp) uint_t szc = pp->p_szc; vnode_t *vp = pp->p_vnode; - ASSERT(PAGE_EXCL(rootpp)); + ASSERT(PAGE_EXCL(pp)); VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]); - if (rootpp->p_szc == 0) { + if (pp->p_szc == 0) { VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]); return (1); } if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) { VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]); - page_demote_vp_pages(rootpp); + page_demote_vp_pages(pp); ASSERT(pp->p_szc == 0); return (1); } /* - * Adjust rootpp if passed in is not the base + * Adjust rootpp if passed in is not the base * constituent page. 
*/ - npgs = page_get_pagecnt(rootpp->p_szc); + npgs = page_get_pagecnt(pp->p_szc); ASSERT(npgs > 1); if (!IS_P2ALIGNED(pfn, npgs)) { pfn = P2ALIGN(pfn, npgs); @@ -5455,12 +5263,11 @@ page_try_demote_pages(page_t *pp) break; ASSERT(tpp->p_szc == rootpp->p_szc); ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i); - (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); } /* - * If we failed to lock them all then unlock what we have locked - * so far and bail. + * If we failed to lock them all then unlock what we have + * locked so far and bail. */ if (i < npgs) { tpp = rootpp; @@ -5473,12 +5280,9 @@ page_try_demote_pages(page_t *pp) return (0); } - /* - * XXX probably p_szc clearing and page unlocking can be done within - * one loop but since this is rare code we can play very safe. - */ for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) { ASSERT(PAGE_EXCL(tpp)); + (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); tpp->p_szc = 0; } @@ -5490,6 +5294,7 @@ page_try_demote_pages(page_t *pp) if (tpp != pp) page_unlock(tpp); } + VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]); return (1); } @@ -5579,221 +5384,6 @@ page_demote_vp_pages(page_t *pp) } /* - * Page retire operation. - * - * page_retire() - * Attempt to retire (throw away) page pp. We cannot do this if - * the page is dirty; if the page is clean, we can try. We return 0 on - * success, -1 on failure. This routine should be invoked by the platform's - * memory error detection code. - * - * pages_retired_limit_exceeded() - * We set a limit on the number of pages which may be retired. This - * is set to a percentage of total physical memory. This limit is - * enforced here. - */ - -static pgcnt_t retired_pgcnt = 0; - -/* - * routines to update the count of retired pages - */ -static void -page_retired(page_t *pp) -{ - ASSERT(pp); - - page_settoxic(pp, PAGE_IS_RETIRED); - atomic_add_long(&retired_pgcnt, 1); -} - -static void -retired_page_removed(page_t *pp) -{ - ASSERT(pp); - ASSERT(page_isretired(pp)); - ASSERT(retired_pgcnt > 0); - - page_clrtoxic(pp); - atomic_add_long(&retired_pgcnt, -1); -} - - -static int -pages_retired_limit_exceeded() -{ - pgcnt_t retired_max; - - /* - * If the percentage is zero or is not set correctly, - * return TRUE so that pages are not retired. - */ - if (max_pages_retired_bps <= 0 || - max_pages_retired_bps >= 10000) - return (1); - - /* - * Calculate the maximum number of pages allowed to - * be retired as a percentage of total physical memory - * (Remember that we are using basis points, hence the 10000.) 
- */ - retired_max = (physmem * max_pages_retired_bps) / 10000; - - /* - * return 'TRUE' if we have already retired more - * than the legal limit - */ - return (retired_pgcnt >= retired_max); -} - -#define PAGE_RETIRE_SELOCK 0 -#define PAGE_RETIRE_NORECLAIM 1 -#define PAGE_RETIRE_LOCKED 2 -#define PAGE_RETIRE_COW 3 -#define PAGE_RETIRE_DIRTY 4 -#define PAGE_RETIRE_LPAGE 5 -#define PAGE_RETIRE_SUCCESS 6 -#define PAGE_RETIRE_LIMIT 7 -#define PAGE_RETIRE_NCODES 8 - -typedef struct page_retire_op { - int pr_count; - short pr_unlock; - short pr_retval; - char *pr_message; -} page_retire_op_t; - -page_retire_op_t page_retire_ops[PAGE_RETIRE_NCODES] = { - { 0, 0, -1, "cannot lock page" }, - { 0, 0, -1, "cannot reclaim cached page" }, - { 0, 1, -1, "page is locked" }, - { 0, 1, -1, "copy-on-write page" }, - { 0, 1, -1, "page is dirty" }, - { 0, 1, -1, "cannot demote large page" }, - { 0, 0, 0, "page successfully retired" }, - { 0, 0, -1, "excess pages retired already" }, -}; - -static int -page_retire_done(page_t *pp, int code) -{ - page_retire_op_t *prop = &page_retire_ops[code]; - - prop->pr_count++; - - if (prop->pr_unlock) - page_unlock(pp); - - if (page_retire_messages > 1) { - printf("page_retire(%p) pfn 0x%lx %s: %s\n", - (void *)pp, page_pptonum(pp), - prop->pr_retval == -1 ? "failed" : "succeeded", - prop->pr_message); - } - - return (prop->pr_retval); -} - -int -page_retire(page_t *pp, uchar_t flag) -{ - uint64_t pa = ptob((uint64_t)page_pptonum(pp)); - - ASSERT(flag == PAGE_IS_FAILING || flag == PAGE_IS_TOXIC); - - /* - * DR operations change the association between a page_t - * and the physical page it represents. Check if the - * page is still bad. - */ - if (!page_isfaulty(pp)) { - page_clrtoxic(pp); - return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); - } - - /* - * We set the flag here so that even if we fail due - * to exceeding the limit for retired pages, the - * page will still be checked and either cleared - * or retired in page_free(). - */ - page_settoxic(pp, flag); - - if (flag == PAGE_IS_TOXIC) { - if (page_retire_messages) { - cmn_err(CE_NOTE, "Scheduling clearing of error on" - " page 0x%08x.%08x", - (uint32_t)(pa >> 32), (uint32_t)pa); - } - - } else { /* PAGE_IS_FAILING */ - if (pages_retired_limit_exceeded()) { - /* - * Return as we have already exceeded the - * maximum number of pages allowed to be - * retired - */ - return (page_retire_done(pp, PAGE_RETIRE_LIMIT)); - } - - if (page_retire_messages) { - cmn_err(CE_NOTE, "Scheduling removal of " - "page 0x%08x.%08x", - (uint32_t)(pa >> 32), (uint32_t)pa); - } - } - - if (PAGE_LOCKED(pp) || !page_trylock(pp, SE_EXCL)) - return (page_retire_done(pp, PAGE_RETIRE_SELOCK)); - - /* - * If this is a large page we first try and demote it - * to PAGESIZE pages and then dispose of the toxic page. - * On failure we will let the page free/destroy - * code handle it later since this is a mapped page. - * Note that free large pages can always be demoted. - * - */ - if (pp->p_szc != 0) { - if (PP_ISFREE(pp)) - (void) page_demote_free_pages(pp); - else - (void) page_try_demote_pages(pp); - - if (pp->p_szc != 0) - return (page_retire_done(pp, PAGE_RETIRE_LPAGE)); - } - - if (PP_ISFREE(pp)) { - if (!page_reclaim(pp, NULL)) - return (page_retire_done(pp, PAGE_RETIRE_NORECLAIM)); - /*LINTED: constant in conditional context*/ - VN_DISPOSE(pp, pp->p_vnode ? 
B_INVAL : B_FREE, 0, kcred) - return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); - } - - if (pp->p_lckcnt != 0) - return (page_retire_done(pp, PAGE_RETIRE_LOCKED)); - - if (pp->p_cowcnt != 0) - return (page_retire_done(pp, PAGE_RETIRE_COW)); - - /* - * Unload all translations to this page. No new translations - * can be created while we hold the exclusive lock on the page. - */ - (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); - - if (hat_ismod(pp)) - return (page_retire_done(pp, PAGE_RETIRE_DIRTY)); - - /*LINTED: constant in conditional context*/ - VN_DISPOSE(pp, B_INVAL, 0, kcred); - - return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); -} - -/* * Mark any existing pages for migration in the given range */ void @@ -6128,140 +5718,6 @@ next: } } -/* - * initialize the vnode for retired pages - */ -static void -page_retired_init(void) -{ - vn_setops(&retired_ppages, &retired_vnodeops); -} - -/* ARGSUSED */ -static void -retired_dispose(vnode_t *vp, page_t *pp, int flag, int dn, cred_t *cr) -{ - panic("retired_dispose invoked"); -} - -/* ARGSUSED */ -static void -retired_inactive(vnode_t *vp, cred_t *cr) -{} - -void -page_unretire_pages(void) -{ - page_t *pp; - kmutex_t *vphm; - vnode_t *vp; - page_t *rpages[UNRETIRE_PAGES]; - pgcnt_t i, npages, rmem; - uint64_t pa; - - rmem = 0; - - for (;;) { - /* - * We do this in 2 steps: - * - * 1. We walk the retired pages list and collect a list of - * pages that have the toxic field cleared. - * - * 2. We iterate through the page list and unretire each one. - * - * We have to do it in two steps on account of the mutexes that - * we need to acquire. - */ - - vp = &retired_ppages; - vphm = page_vnode_mutex(vp); - mutex_enter(vphm); - - if ((pp = vp->v_pages) == NULL) { - mutex_exit(vphm); - break; - } - - i = 0; - do { - ASSERT(pp != NULL); - ASSERT(pp->p_vnode == vp); - - /* - * DR operations change the association between a page_t - * and the physical page it represents. Check if the - * page is still bad. If not, unretire it. - */ - if (!page_isfaulty(pp)) - rpages[i++] = pp; - - pp = pp->p_vpnext; - } while ((pp != vp->v_pages) && (i < UNRETIRE_PAGES)); - - mutex_exit(vphm); - - npages = i; - for (i = 0; i < npages; i++) { - pp = rpages[i]; - pa = ptob((uint64_t)page_pptonum(pp)); - - /* - * Need to upgrade the shared lock to an exclusive - * lock in order to hash out the page. - * - * The page could have been retired but the page lock - * may not have been downgraded yet. If so, skip this - * page. page_free() will call this function after the - * lock is downgraded. - */ - - if (!PAGE_SHARED(pp) || !page_tryupgrade(pp)) - continue; - - /* - * Both page_free() and DR call this function. They - * can potentially call this function at the same - * time and race with each other. - */ - if (!page_isretired(pp) || page_isfaulty(pp)) { - page_downgrade(pp); - continue; - } - - cmn_err(CE_NOTE, - "unretiring retired page 0x%08x.%08x", - (uint32_t)(pa >> 32), (uint32_t)pa); - - /* - * When a page is removed from the retired pages vnode, - * its toxic field is also cleared. So, we do not have - * to do that seperately here. - */ - page_hashout(pp, (kmutex_t *)NULL); - - /* - * This is a good page. So, free it. - */ - pp->p_vnode = NULL; - page_free(pp, 1); - rmem++; - } - - /* - * If the rpages array was filled up, then there could be more - * retired pages that are not faulty. We need to iterate - * again and unretire them. Otherwise, we are done. 
-		 */
-		if (npages < UNRETIRE_PAGES)
-			break;
-	}
-
-	mutex_enter(&freemem_lock);
-	availrmem += rmem;
-	mutex_exit(&freemem_lock);
-}
-
 ulong_t mem_waiters = 0;
 ulong_t max_count = 20;
 #define MAX_DELAY 0x1ff
@@ -6621,90 +6077,48 @@ page_clr_all_props(page_t *pp)
 }
 
 /*
- * The following functions is called from free_vp_pages()
- * for an inexact estimate of a newly free'd page...
+ * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
  */
-ulong_t
-page_share_cnt(page_t *pp)
-{
-	return (hat_page_getshare(pp));
-}
-
-/*
- * The following functions are used in handling memory
- * errors.
- */
-
-int
-page_istoxic(page_t *pp)
-{
-	return ((pp->p_toxic & PAGE_IS_TOXIC) == PAGE_IS_TOXIC);
-}
-
-int
-page_isfailing(page_t *pp)
-{
-	return ((pp->p_toxic & PAGE_IS_FAILING) == PAGE_IS_FAILING);
-}
-
-int
-page_isretired(page_t *pp)
-{
-	return ((pp->p_toxic & PAGE_IS_RETIRED) == PAGE_IS_RETIRED);
-}
-
 int
-page_deteriorating(page_t *pp)
+page_clear_lck_cow(page_t *pp, int adjust)
 {
-	return ((pp->p_toxic & (PAGE_IS_TOXIC | PAGE_IS_FAILING)) != 0);
-}
+	int	f_amount;
 
-void
-page_settoxic(page_t *pp, uchar_t flag)
-{
-	uchar_t new_flag = 0;
-	while ((new_flag & flag) != flag) {
-		uchar_t old_flag = pp->p_toxic;
-		new_flag = old_flag | flag;
-		(void) cas8(&pp->p_toxic, old_flag, new_flag);
-		new_flag = ((volatile page_t *)pp)->p_toxic;
-	}
-}
+	ASSERT(PAGE_EXCL(pp));
 
-void
-page_clrtoxic(page_t *pp)
-{
 	/*
-	 * We don't need to worry about atomicity on the
-	 * p_toxic flag here as this is only called from
-	 * page_free() while holding an exclusive lock on
-	 * the page
+	 * The page_struct_lock need not be acquired here since
+	 * we require the caller hold the page exclusively locked.
 	 */
-	pp->p_toxic = PAGE_IS_OK;
-}
+	f_amount = 0;
+	if (pp->p_lckcnt) {
+		f_amount = 1;
+		pp->p_lckcnt = 0;
+	}
+	if (pp->p_cowcnt) {
+		f_amount += pp->p_cowcnt;
+		pp->p_cowcnt = 0;
+	}
 
-void
-page_clrtoxic_flag(page_t *pp, uchar_t flag)
-{
-	uchar_t new_flag = ((volatile page_t *)pp)->p_toxic;
-	while ((new_flag & flag) == flag) {
-		uchar_t old_flag = new_flag;
-		new_flag = old_flag & ~flag;
-		(void) cas8(&pp->p_toxic, old_flag, new_flag);
-		new_flag = ((volatile page_t *)pp)->p_toxic;
+	if (adjust && f_amount) {
+		mutex_enter(&freemem_lock);
+		availrmem += f_amount;
+		mutex_exit(&freemem_lock);
 	}
-}
 
-int
-page_isfaulty(page_t *pp)
-{
-	return ((pp->p_toxic & PAGE_IS_FAULTY) == PAGE_IS_FAULTY);
+	return (f_amount);
 }
 
 /*
- * The following four functions are called from /proc code
- * for the /proc/<pid>/xmap interface.
+ * The following function is called from free_vp_pages()
+ * for an inexact estimate of a newly free'd page...
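+ * (It simply wraps hat_page_getshare(); the value may change as soon as
+ * it is read, hence "inexact".)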
*/ +ulong_t +page_share_cnt(page_t *pp) +{ + return (hat_page_getshare(pp)); +} + int page_isshared(page_t *pp) { diff --git a/usr/src/uts/common/vm/vm_pagelist.c b/usr/src/uts/common/vm/vm_pagelist.c index 225e8d157f..994ddca8a6 100644 --- a/usr/src/uts/common/vm/vm_pagelist.c +++ b/usr/src/uts/common/vm/vm_pagelist.c @@ -1305,7 +1305,7 @@ page_list_add_pages(page_t *pp, int flags) kcage_freemem_add(pgcnt); #endif for (i = 0; i < pgcnt; i++, pp++) - page_unlock(pp); + page_unlock_noretire(pp); } } @@ -1753,7 +1753,7 @@ page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags) index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); phm = PAGE_HASH_MUTEX(index); if (!mutex_tryenter(phm)) { - page_unlock(pp); + page_unlock_noretire(pp); goto fail_promote; } @@ -1761,7 +1761,7 @@ page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags) page_hashout(pp, phm); mutex_exit(phm); PP_SETAGED(pp); - page_unlock(pp); + page_unlock_noretire(pp); which_list = PG_CACHE_LIST; } page_ctr_sub(mnode, mtype, pp, which_list); @@ -2209,7 +2209,7 @@ page_trylock_cons(page_t *pp, se_t se) * have locked so far. */ while (first_pp != tpp) { - page_unlock(first_pp); + page_unlock_noretire(first_pp); first_pp = first_pp->p_next; } return (0); @@ -2575,7 +2575,7 @@ skipptcpcheck: while (--i != (pgcnt_t)-1) { pp = &spp[i]; ASSERT(PAGE_EXCL(pp)); - page_unlock(pp); + page_unlock_noretire(pp); } return (0); } @@ -2584,7 +2584,7 @@ skipptcpcheck: !PP_ISFREE(pp)) { VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); ASSERT(i == 0); - page_unlock(pp); + page_unlock_noretire(pp); return (0); } if (PP_ISNORELOC(pp)) { @@ -2592,7 +2592,7 @@ skipptcpcheck: while (i != (pgcnt_t)-1) { pp = &spp[i]; ASSERT(PAGE_EXCL(pp)); - page_unlock(pp); + page_unlock_noretire(pp); i--; } return (0); @@ -2687,7 +2687,7 @@ page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) */ while (pgcnt--) { ASSERT(PAGE_EXCL(pp)); - page_unlock(pp); + page_unlock_noretire(pp); pp++; } /* @@ -2702,7 +2702,7 @@ page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) ASSERT(PP_ISAGED(pp)); pp->p_szc = 0; page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); - page_unlock(pp); + page_unlock_noretire(pp); } if (replpp != NULL) @@ -2734,7 +2734,7 @@ page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) page_sub(&replpp, rpp); ASSERT(PAGE_EXCL(rpp)); ASSERT(!PP_ISFREE(rpp)); - page_unlock(rpp); + page_unlock_noretire(rpp); } ASSERT(targpp == hpp); ASSERT(replpp == NULL); diff --git a/usr/src/uts/i86pc/os/machdep.c b/usr/src/uts/i86pc/os/machdep.c index 1d6cd158b4..5071dae5ee 100644 --- a/usr/src/uts/i86pc/os/machdep.c +++ b/usr/src/uts/i86pc/os/machdep.c @@ -226,6 +226,8 @@ mdboot(int cmd, int fcn, char *mdep, boolean_t invoke_cb) if (invoke_cb) (void) callb_execute_class(CB_CL_MDBOOT, NULL); + page_retire_hunt(page_retire_mdboot_cb); + /* * stop other cpus and raise our priority. since there is only * one active cpu after this, and our priority will be too high diff --git a/usr/src/uts/sun4u/cpu/spitfire.c b/usr/src/uts/sun4u/cpu/spitfire.c index 35d97e6b23..75f57ce39c 100644 --- a/usr/src/uts/sun4u/cpu/spitfire.c +++ b/usr/src/uts/sun4u/cpu/spitfire.c @@ -432,6 +432,7 @@ void cpu_setup(void) { extern int page_retire_messages; + extern int page_retire_first_ue; extern int at_flags; #if defined(SF_ERRATA_57) extern caddr_t errata57_limit; @@ -445,9 +446,11 @@ cpu_setup(void) /* * Spitfire isn't currently FMA-aware, so we have to enable the - * page retirement messages. + * page retirement messages. 
We also change the default policy + * for UE retirement to allow clearing of transient errors. */ page_retire_messages = 1; + page_retire_first_ue = 0; /* * save the cache bootup state. @@ -895,10 +898,7 @@ cpu_ce_error(struct regs *rp, ulong_t p_afar, ulong_t p_afsr, curthread->t_ontrap != NULL) { if (curthread->t_ontrap->ot_prot & OT_DATA_EC) { - page_t *pp = page_numtopp_nolock((pfn_t) - (ecc->flt_addr >> MMU_PAGESHIFT)); - - if (pp != NULL && page_isretired(pp)) { + if (page_retire_check(ecc->flt_addr, NULL) == 0) { queue = 0; } } @@ -1093,6 +1093,7 @@ cpu_ce_log_err(struct async_flt *ecc, errorq_elem_t *eqep) char unum[UNUM_NAMLEN]; int len = 0; int ce_verbose = 0; + int err; ASSERT(ecc->flt_func != NULL); @@ -1107,15 +1108,9 @@ cpu_ce_log_err(struct async_flt *ecc, errorq_elem_t *eqep) * Count errors per unum. * Non-memory errors are all counted via a special unum string. */ - if (ce_count_unum(ecc->flt_status, len, unum) == PAGE_IS_FAILING && + if ((err = ce_count_unum(ecc->flt_status, len, unum)) != PR_OK && automatic_page_removal) { - page_t *pp = page_numtopp_nolock((pfn_t) - (ecc->flt_addr >> MMU_PAGESHIFT)); - - if (pp) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_FAILING); - } + (void) page_retire(ecc->flt_addr, err); } if (ecc->flt_panic) { @@ -2092,11 +2087,7 @@ cpu_async_log_err(void *flt) if (!panicstr && (aflt->flt_stat & S_AFSR_ALL_ERRS) == P_AFSR_UE && aflt->flt_prot == AFLT_PROT_EC) { - page_t *pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - - if (pp != NULL && page_isretired(pp)) { - + if (page_retire_check(aflt->flt_addr, NULL) == 0) { /* Zero the address to clear the error */ softcall(ecc_page_zero, (void *)aflt->flt_addr); return; @@ -2305,25 +2296,7 @@ cpu_async_log_err(void *flt) if (aflt->flt_addr != AFLT_INV_ADDR && aflt->flt_in_memory) { if (!panicstr) { - /* - * Retire the bad page that caused the error - */ - page_t *pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - - if (pp != NULL) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_TOXIC); - } else { - uint64_t pa = - P2ALIGN(aflt->flt_addr, MMU_PAGESIZE); - - cpu_aflt_log(CE_CONT, 3, spf_flt, - CPU_ERRID_FIRST, NULL, - ": cannot schedule clearing of error on " - "page 0x%08x.%08x; page not in VM system", - (uint32_t)(pa >> 32), (uint32_t)pa); - } + (void) page_retire(aflt->flt_addr, PR_UE); } else { /* * Clear UEs on panic so that we don't @@ -4089,12 +4062,7 @@ static void ecache_page_retire(void *arg) { uint64_t paddr = (uint64_t)arg; - page_t *pp = page_numtopp_nolock((pfn_t)(paddr >> MMU_PAGESHIFT)); - - if (pp) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_TOXIC); - } + (void) page_retire(paddr, PR_UE); } void @@ -4331,15 +4299,14 @@ add_leaky_bucket_timeout(void) * false intermittents, so these intermittents can be safely ignored. * * If the error count is excessive for a DIMM, this function will return - * PAGE_IS_FAILING, and the CPU module may then decide to remove that page - * from use. + * PR_MCE, and the CPU module may then decide to remove that page from use. 
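+ * (PR_MCE is the page_retire() reason code for pages that have seen
+ * multiple correctable errors.)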
*/ static int ce_count_unum(int status, int len, char *unum) { int i; struct ce_info *psimm = mem_ce_simm; - int page_status = PAGE_IS_OK; + int page_status = PR_OK; ASSERT(psimm != NULL); @@ -4375,7 +4342,7 @@ ce_count_unum(int status, int len, char *unum) cmn_err(CE_WARN, "[AFT0] Sticky Softerror encountered " "on Memory Module %s\n", unum); - page_status = PAGE_IS_FAILING; + page_status = PR_MCE; } else if (status & ECC_PERSISTENT) { psimm[i].leaky_bucket_cnt = 1; psimm[i].intermittent_total = 0; @@ -4404,7 +4371,7 @@ ce_count_unum(int status, int len, char *unum) cmn_err(CE_WARN, "[AFT0] Sticky Softerror encountered " "on Memory Module %s\n", unum); - page_status = PAGE_IS_FAILING; + page_status = PR_MCE; } else if (status & ECC_PERSISTENT) { int new_value; @@ -4422,7 +4389,7 @@ ce_count_unum(int status, int len, char *unum) ecc_softerr_interval % 60); atomic_add_16( &psimm[i].leaky_bucket_cnt, -1); - page_status = PAGE_IS_FAILING; + page_status = PR_MCE; } } else { /* Intermittent */ psimm[i].intermittent_total++; @@ -4444,15 +4411,11 @@ ce_count_unum(int status, int len, char *unum) void cpu_ce_count_unum(struct async_flt *ecc, int len, char *unum) { - if (ce_count_unum(ecc->flt_status, len, unum) == PAGE_IS_FAILING && - automatic_page_removal) { - page_t *pp = page_numtopp_nolock((pfn_t) - (ecc->flt_addr >> MMU_PAGESHIFT)); + int err; - if (pp) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_FAILING); - } + err = ce_count_unum(ecc->flt_status, len, unum); + if (err != PR_OK && automatic_page_removal) { + (void) page_retire(ecc->flt_addr, err); } } diff --git a/usr/src/uts/sun4u/cpu/us3_common.c b/usr/src/uts/sun4u/cpu/us3_common.c index f8d8b2bb77..f7cc35c664 100644 --- a/usr/src/uts/sun4u/cpu/us3_common.c +++ b/usr/src/uts/sun4u/cpu/us3_common.c @@ -2205,7 +2205,7 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) { ch_async_flt_t *ch_flt = (ch_async_flt_t *)flt; struct async_flt *aflt = (struct async_flt *)flt; - page_t *pp; + uint64_t errors; switch (ch_flt->flt_type) { case CPU_INV_AFSR: @@ -2236,9 +2236,6 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) */ case CPU_CE: case CPU_EMC: - pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - /* * We want to skip logging and further classification * only if ALL the following conditions are true: @@ -2258,7 +2255,7 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_CE && aflt->flt_prot == AFLT_PROT_EC) { - if (pp != NULL && page_isretired(pp)) { + if (page_retire_check(aflt->flt_addr, NULL) == 0) { if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) { /* @@ -2289,17 +2286,17 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) * * Note: Check cpu_impl_async_log_err if changing this */ - if (pp) { - if (page_isretired(pp) || page_deteriorating(pp)) { + if (page_retire_check(aflt->flt_addr, &errors) == EINVAL) { + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_NOPP); + } else { + if (errors != PR_OK) { CE_XDIAG_SETSKIPCODE(aflt->flt_disp, CE_XDIAG_SKIP_PAGEDET); } else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep, offsetof(ch_async_flt_t, cmn_asyncflt))) { return (0); } - } else { - CE_XDIAG_SETSKIPCODE(aflt->flt_disp, - CE_XDIAG_SKIP_NOPP); } /*FALLTHRU*/ @@ -2325,11 +2322,7 @@ cpu_async_log_err(void *flt, errorq_elem_t *eqep) if (!panicstr && (ch_flt->afsr_errs & (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_UE && aflt->flt_prot == AFLT_PROT_EC) { - page_t *pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - 
- if (pp != NULL && page_isretired(pp)) { - + if (page_retire_check(aflt->flt_addr, NULL) == 0) { /* Zero the address to clear the error */ softcall(ecc_page_zero, (void *)aflt->flt_addr); return (0); @@ -2387,12 +2380,7 @@ void cpu_page_retire(ch_async_flt_t *ch_flt) { struct async_flt *aflt = (struct async_flt *)ch_flt; - page_t *pp = page_numtopp_nolock(aflt->flt_addr >> MMU_PAGESHIFT); - - if (pp != NULL) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_TOXIC); - } + (void) page_retire(aflt->flt_addr, PR_UE); } /* diff --git a/usr/src/uts/sun4u/cpu/us3_jalapeno.c b/usr/src/uts/sun4u/cpu/us3_jalapeno.c index cd71848200..0b7936d426 100644 --- a/usr/src/uts/sun4u/cpu/us3_jalapeno.c +++ b/usr/src/uts/sun4u/cpu/us3_jalapeno.c @@ -316,7 +316,7 @@ cpu_impl_async_log_err(void *flt, errorq_elem_t *eqep) { ch_async_flt_t *ch_flt = (ch_async_flt_t *)flt; struct async_flt *aflt = (struct async_flt *)flt; - page_t *pp; + uint64_t errors; switch (ch_flt->flt_type) { @@ -329,19 +329,15 @@ cpu_impl_async_log_err(void *flt, errorq_elem_t *eqep) return (CH_ASYNC_LOG_DONE); case CPU_RCE: - pp = page_numtopp_nolock((pfn_t) - (aflt->flt_addr >> MMU_PAGESHIFT)); - if (pp) { - if (page_isretired(pp) || page_deteriorating(pp)) { - CE_XDIAG_SETSKIPCODE(aflt->flt_disp, - CE_XDIAG_SKIP_PAGEDET); - } else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep, - offsetof(ch_async_flt_t, cmn_asyncflt))) { - return (CH_ASYNC_LOG_RECIRC); - } - } else { + if (page_retire_check(aflt->flt_addr, &errors) == EINVAL) { CE_XDIAG_SETSKIPCODE(aflt->flt_disp, CE_XDIAG_SKIP_NOPP); + } else if (errors != PR_OK) { + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_PAGEDET); + } else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep, + offsetof(ch_async_flt_t, cmn_asyncflt))) { + return (CH_ASYNC_LOG_RECIRC); } /*FALLTHRU*/ /* diff --git a/usr/src/uts/sun4u/io/pci/pci_ecc.c b/usr/src/uts/sun4u/io/pci/pci_ecc.c index 0f92d73663..8820683ba6 100644 --- a/usr/src/uts/sun4u/io/pci/pci_ecc.c +++ b/usr/src/uts/sun4u/io/pci/pci_ecc.c @@ -534,21 +534,21 @@ ecc_err_handler(ecc_errstate_t *ecc_err_p) * Called from ecc_err_drain below for CBINTR_CE case. */ static int -ecc_err_cexdiag(page_t *pp, ecc_errstate_t *ecc_err, - errorq_elem_t *eqep) +ecc_err_cexdiag(ecc_errstate_t *ecc_err, errorq_elem_t *eqep) { struct async_flt *ecc = &ecc_err->ecc_aflt; + uint64_t errors; - if (!pp) { + if (page_retire_check(ecc->flt_addr, &errors) == EINVAL) { CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_NOPP); return (0); - } else if (page_isretired(pp) || page_deteriorating(pp)) { + } else if (errors != PR_OK) { CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_PAGEDET); return (0); + } else { + return (ce_scrub_xdiag_recirc(ecc, pci_ecc_queue, eqep, + offsetof(ecc_errstate_t, ecc_aflt))); } - - return (ce_scrub_xdiag_recirc(ecc, pci_ecc_queue, eqep, - offsetof(ecc_errstate_t, ecc_aflt))); } /* @@ -561,7 +561,6 @@ ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep) { struct async_flt *ecc = &ecc_err->ecc_aflt; pci_t *pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[0]; - page_t *pp; int ecc_type = ecc_err->ecc_ii_p.ecc_type; if (pci_p == NULL) @@ -581,13 +580,10 @@ ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep) ecc_cpu_call(ecc, ecc_err->ecc_unum, (ecc_type == CBNINTR_UE) ? 
ECC_IO_UE : ECC_IO_CE); - pp = page_numtopp_nolock(ecc->flt_addr >> MMU_PAGESHIFT); - switch (ecc_type) { case CBNINTR_UE: - if (pp && ecc_err->ecc_pg_ret == 1) { - page_settoxic(pp, PAGE_IS_FAULTY); - (void) page_retire(pp, PAGE_IS_TOXIC); + if (ecc_err->ecc_pg_ret == 1) { + (void) page_retire(ecc->flt_addr, PR_UE); } ecc_err->ecc_err_type = flt_to_error_type(ecc); break; @@ -609,7 +605,7 @@ ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep) /* ecc_err_cexdiag returns nonzero to recirculate */ if (CE_XDIAG_EXT_ALG_APPLIED(ecc->flt_disp) && - ecc_err_cexdiag(pp, ecc_err, eqep)) + ecc_err_cexdiag(ecc_err, eqep)) return; ecc_err->ecc_err_type = flt_to_error_type(ecc); break; diff --git a/usr/src/uts/sun4u/ngdr/io/dr_mem.c b/usr/src/uts/sun4u/ngdr/io/dr_mem.c index e876db93b5..1dd67f5824 100644 --- a/usr/src/uts/sun4u/ngdr/io/dr_mem.c +++ b/usr/src/uts/sun4u/ngdr/io/dr_mem.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -80,8 +80,6 @@ static struct memlist *memlist_del_span(struct memlist *mlist, static struct memlist *memlist_cat_span(struct memlist *mlist, uint64_t base, uint64_t len); -extern void page_unretire_pages(void); - /* * dr_mem_unit_t.sbm_flags */ @@ -427,57 +425,13 @@ dr_mem_ecache_scrub(dr_mem_unit_t *mp, struct memlist *mlist) #endif /* DEBUG */ } -/* - * This function marks as clean, all the faulty pages that belong to the - * board that is copy-renamed since they are not likely to be bad pages - * after the rename. This includes the retired pages on the board. - */ - -static void -dr_memlist_clrpages(struct memlist *r_ml) -{ - struct memlist *t_ml; - page_t *pp, *epp; - pfn_t pfn, epfn; - struct memseg *seg; - - if (r_ml == NULL) - return; - - for (t_ml = r_ml; (t_ml != NULL); t_ml = t_ml->next) { - pfn = _b64top(t_ml->address); - epfn = _b64top(t_ml->address + t_ml->size); - - for (seg = memsegs; seg != NULL; seg = seg->next) { - if (pfn >= seg->pages_end || epfn < seg->pages_base) - continue; - - pp = seg->pages; - if (pfn > seg->pages_base) - pp += pfn - seg->pages_base; - - epp = seg->epages; - if (epfn < seg->pages_end) - epp -= seg->pages_end - epfn; - - ASSERT(pp < epp); - while (pp < epp) { - if (page_isfaulty((page_t *)pp)) - page_clrtoxic_flag((page_t *)pp, - PAGE_IS_FAULTY); - pp++; - } - } - } -} - static int dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp) { time_t copytime; drmachid_t cr_id; dr_sr_handle_t *srhp; - struct memlist *c_ml, *d_ml, *r_ml; + struct memlist *c_ml, *d_ml; sbd_error_t *err; static fn_t f = "dr_move_memory"; @@ -507,11 +461,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp) d_ml = d_ml->next; } - /* - * create a copy of the memlist to be used for retiring pages. 
- */ - r_ml = memlist_dup(c_ml); - affinity_set(drmach_mem_cpu_affinity(t_mp->sbm_cm.sbdev_id)); err = drmach_copy_rename_init( @@ -520,7 +469,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp) if (err) { DRERR_SET_C(&s_mp->sbm_cm.sbdev_error, &err); affinity_clear(); - memlist_delete(r_ml); return (-1); } @@ -553,7 +501,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp) hp->h_err = NULL; affinity_clear(); - memlist_delete(r_ml); return (-1); } @@ -573,12 +520,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp) drmach_copy_rename(cr_id); - /* - * Clear pages that have been marked as faulty since we are - * changing the physical memory for the pages. - */ - dr_memlist_clrpages(r_ml); - /* Resume the OS. */ dr_resume(srhp); @@ -594,11 +535,6 @@ dr_move_memory(dr_handle_t *hp, dr_mem_unit_t *s_mp, dr_mem_unit_t *t_mp) PR_MEM("%s: copy-rename elapsed time = %ld ticks (%ld secs)\n", f, copytime, copytime / hz); - memlist_delete(r_ml); - - /* Unretire any pages cleared after copy-rename */ - page_unretire_pages(); - /* return -1 if dr_suspend or copy/rename recorded an error */ return (err == NULL ? 0 : -1); } diff --git a/usr/src/uts/sun4u/os/ecc.c b/usr/src/uts/sun4u/os/ecc.c index 10b6cb523f..bd933377e4 100644 --- a/usr/src/uts/sun4u/os/ecc.c +++ b/usr/src/uts/sun4u/os/ecc.c @@ -247,23 +247,21 @@ error_init(void) } /* - * Success flags for ecc_page_zero + * Flags for ecc_page_zero DTrace probe since ecc_page_zero() is called + * as a softint handler. */ #define PAGE_ZERO_SUCCESS 0 #define PAGE_ZERO_FAIL_NOLOCK 1 #define PAGE_ZERO_FAIL_ONTRAP 2 -/* - * arg is a physical address - zero out the page that contains it - */ void ecc_page_zero(void *arg) { uint64_t pa = (uint64_t)arg; - page_t *pp = page_numtopp_nolock((pfn_t)(pa >> MMU_PAGESHIFT)); int ret, success_flag; + page_t *pp = page_numtopp_nolock(mmu_btop(pa)); - if (pp == NULL || !page_isretired(pp)) + if (page_retire_check(pa, NULL) != 0) return; /* diff --git a/usr/src/uts/sun4u/os/mach_cpu_states.c b/usr/src/uts/sun4u/os/mach_cpu_states.c index 0815f54170..4144c91c79 100644 --- a/usr/src/uts/sun4u/os/mach_cpu_states.c +++ b/usr/src/uts/sun4u/os/mach_cpu_states.c @@ -66,7 +66,6 @@ extern int disable_watchdog_on_exit; void mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb) { - page_t *first, *pp; extern void pm_cfb_check_and_powerup(void); /* @@ -79,25 +78,6 @@ mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb) } /* - * Clear any unresolved UEs from memory. We rely on the fact that on - * sun4u, pagezero() will always clear UEs. Since we're rebooting, we - * just force p_selock to appear locked so pagezero()'s assert works. - * - * Pages that were retired successfully due to multiple CEs will - * also be cleared. - */ - if (memsegs != NULL) { - pp = first = page_first(); - do { - if (page_isretired(pp) || page_istoxic(pp)) { - /* pagezero asserts PAGE_LOCKED */ - pp->p_selock = -1; - pagezero(pp, 0, PAGESIZE); - } - } while ((pp = page_next(pp)) != first); - } - - /* * XXX - rconsvp is set to NULL to ensure that output messages * are sent to the underlying "hardware" device using the * monitor's printf routine since we are in the process of @@ -123,6 +103,12 @@ mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb) (void) callb_execute_class(CB_CL_MDBOOT, NULL); /* + * Clear any unresolved UEs from memory. + */ + if (memsegs != NULL) + page_retire_hunt(page_retire_mdboot_cb); + + /* * stop other cpus which also raise our priority. 
since there is only * one active cpu after this, and our priority will be too high * for us to be preempted, we're essentially single threaded diff --git a/usr/src/uts/sun4v/os/error.c b/usr/src/uts/sun4v/os/error.c index 9d13b1781b..bd2b7fde49 100644 --- a/usr/src/uts/sun4v/os/error.c +++ b/usr/src/uts/sun4v/os/error.c @@ -87,8 +87,7 @@ static uint32_t rq_overflow_count = 0; /* counter for rq overflow */ static void cpu_queue_one_event(errh_async_flt_t *); static uint32_t count_entries_on_queue(uint64_t, uint64_t, uint32_t); -static void errh_page_settoxic(errh_async_flt_t *, uchar_t); -static void errh_page_retire(errh_async_flt_t *); +static void errh_page_retire(errh_async_flt_t *, uchar_t); static int errh_error_protected(struct regs *, struct async_flt *, int *); static void errh_rq_full(struct async_flt *); static void ue_drain(void *, struct async_flt *, errorq_elem_t *); @@ -300,12 +299,10 @@ process_nonresumable_error(struct regs *rp, uint64_t tl, } /* - * If it is a memory error, we turn on the PAGE_IS_TOXIC - * flag. The page will be retired later and scrubbed when - * it is freed. + * Call page_retire() to handle memory errors. */ if (errh_flt.errh_er.attr & ERRH_ATTR_MEM) - (void) errh_page_settoxic(&errh_flt, PAGE_IS_TOXIC); + errh_page_retire(&errh_flt, PR_UE); /* * If we queued an error and the it was in user mode or @@ -443,10 +440,10 @@ cpu_async_log_err(void *flt) case ERRH_DESC_UCOR_RE: if (errh_erp->attr & ERRH_ATTR_MEM) { /* - * Turn on the PAGE_IS_TOXIC flag. The page will be + * Turn on the PR_UE flag. The page will be * scrubbed when it is freed. */ - (void) errh_page_settoxic(errh_fltp, PAGE_IS_TOXIC); + errh_page_retire(errh_fltp, PR_UE); } break; @@ -458,7 +455,7 @@ cpu_async_log_err(void *flt) * For non-resumable memory error, retire * the page here. */ - errh_page_retire(errh_fltp); + errh_page_retire(errh_fltp, PR_UE); /* * If we are going to panic, scrub the page first @@ -518,9 +515,8 @@ cpu_ue_log_err(struct async_flt *aflt) * Turn on flag on the error memory region. */ static void -errh_page_settoxic(errh_async_flt_t *errh_fltp, uchar_t flag) +errh_page_retire(errh_async_flt_t *errh_fltp, uchar_t flag) { - page_t *pp; uint64_t flt_real_addr_start = errh_fltp->errh_er.ra; uint64_t flt_real_addr_end = flt_real_addr_start + errh_fltp->errh_er.sz - 1; @@ -531,38 +527,7 @@ errh_page_settoxic(errh_async_flt_t *errh_fltp, uchar_t flag) for (current_addr = flt_real_addr_start; current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) { - pp = page_numtopp_nolock((pfn_t) - (current_addr >> MMU_PAGESHIFT)); - - if (pp != NULL) { - page_settoxic(pp, flag); - } - } -} - -/* - * Retire the page(s) indicated in the error report. 
- */ -static void -errh_page_retire(errh_async_flt_t *errh_fltp) -{ - page_t *pp; - uint64_t flt_real_addr_start = errh_fltp->errh_er.ra; - uint64_t flt_real_addr_end = flt_real_addr_start + - errh_fltp->errh_er.sz - 1; - int64_t current_addr; - - if (errh_fltp->errh_er.sz == 0) - return; - - for (current_addr = flt_real_addr_start; - current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) { - pp = page_numtopp_nolock((pfn_t) - (current_addr >> MMU_PAGESHIFT)); - - if (pp != NULL) { - (void) page_retire(pp, PAGE_IS_TOXIC); - } + (void) page_retire(current_addr, flag); } } diff --git a/usr/src/uts/sun4v/os/mach_cpu_states.c b/usr/src/uts/sun4v/os/mach_cpu_states.c index 75e2421e69..46c1fdbeff 100644 --- a/usr/src/uts/sun4v/os/mach_cpu_states.c +++ b/usr/src/uts/sun4v/os/mach_cpu_states.c @@ -106,29 +106,9 @@ extern uint64_t get_cpuaddr(uint64_t, uint64_t); void mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb) { - page_t *first, *pp; extern void pm_cfb_check_and_powerup(void); /* - * Clear any unresolved UEs from memory. We rely on the fact that on - * sun4u, pagezero() will always clear UEs. Since we're rebooting, we - * just force p_selock to appear locked so pagezero()'s assert works. - * - * Pages that were retired successfully due to multiple CEs will - * also be cleared. - */ - if (memsegs != NULL) { - pp = first = page_first(); - do { - if (page_isretired(pp) || page_istoxic(pp)) { - /* pagezero asserts PAGE_LOCKED */ - pp->p_selock = -1; - pagezero(pp, 0, PAGESIZE); - } - } while ((pp = page_next(pp)) != first); - } - - /* * XXX - rconsvp is set to NULL to ensure that output messages * are sent to the underlying "hardware" device using the * monitor's printf routine since we are in the process of @@ -154,6 +134,12 @@ mdboot(int cmd, int fcn, char *bootstr, boolean_t invoke_cb) (void) callb_execute_class(CB_CL_MDBOOT, NULL); /* + * Clear any unresolved UEs from memory. + */ + if (memsegs != NULL) + page_retire_hunt(page_retire_mdboot_cb); + + /* * stop other cpus which also raise our priority. since there is only * one active cpu after this, and our priority will be too high * for us to be preempted, we're essentially single threaded |