Diffstat (limited to 'usr/src/uts/sun4u/cpu')
| -rw-r--r-- | usr/src/uts/sun4u/cpu/cheetah_copy.s | 3795 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/common_asm.s | 1333 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/mach_cpu_module.c | 298 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/spitfire.c | 4568 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/spitfire_asm.s | 2017 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/spitfire_copy.s | 4939 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/spitfire_kdi.c | 152 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/us3_cheetah.c | 731 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/us3_cheetah_asm.s | 456 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/us3_cheetahplus.c | 1317 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/us3_cheetahplus_asm.s | 989 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/us3_common.c | 6863 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/us3_common_asm.s | 3242 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/us3_common_mmu.c | 661 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/us3_jalapeno.c | 904 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/us3_jalapeno_asm.s | 1039 |
| -rw-r--r-- | usr/src/uts/sun4u/cpu/us3_kdi.c | 158 |
17 files changed, 33462 insertions(+), 0 deletions(-)
diff --git a/usr/src/uts/sun4u/cpu/cheetah_copy.s b/usr/src/uts/sun4u/cpu/cheetah_copy.s new file mode 100644 index 0000000000..44961025d1 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/cheetah_copy.s @@ -0,0 +1,3795 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/asm_linkage.h> +#include <sys/vtrace.h> +#include <sys/machthread.h> +#include <sys/clock.h> +#include <sys/asi.h> +#include <sys/fsr.h> +#include <sys/privregs.h> +#include <sys/fpras_impl.h> + +#if !defined(lint) +#include "assym.h" +#endif /* lint */ + +/* + * Pseudo-code to aid in understanding the control flow of the + * bcopy/copyin/copyout routines. + * + * On entry: + * + * ! Determine whether to use the FP register version + * ! or the leaf routine version depending on size + * ! of copy and flags. Set up error handling accordingly. + * ! The transition point depends on whether the src and + * ! dst addresses can be aligned to long word, word, + * ! half word, or byte boundaries. + * ! + * ! WARNING: <Register usage convention> + * ! For FP version, %l6 holds previous error handling and + * ! a flag: TRAMP_FLAG (low bits) + * ! for leaf routine version, %o4 holds those values. + * ! So either %l6 or %o4 is reserved and not available for + * ! any other use. + * + * if (length <= VIS_COPY_THRESHOLD) ! start with a quick test + * go to small_copy; ! to speed short copies + * + * ! src, dst long word alignable + * if (hw_copy_limit_8 == 0) ! hw_copy disabled + * go to small_copy; + * if (length <= hw_copy_limit_8) + * go to small_copy; + * go to FPBLK_copy; + * } + * if (src,dst not alignable) { + * if (hw_copy_limit_1 == 0) ! hw_copy disabled + * go to small_copy; + * if (length <= hw_copy_limit_1) + * go to small_copy; + * go to FPBLK_copy; + * } + * if (src,dst halfword alignable) { + * if (hw_copy_limit_2 == 0) ! hw_copy disabled + * go to small_copy; + * if (length <= hw_copy_limit_2) + * go to small_copy; + * go to FPBLK_copy; + * } + * if (src,dst word alignable) { + * if (hw_copy_limit_4 == 0) ! hw_copy disabled + * go to small_copy; + * if (length <= hw_copy_limit_4) + * go to small_copy; + * go to FPBLK_copy; + * } + * + * small_copy: + * Setup_leaf_rtn_error_handler; ! diffs for each entry point + * + * if (count <= 3) ! fast path for tiny copies + * go to sm_left; ! special finish up code + * else + * if (count > CHKSIZE) ! medium sized copies + * go to sm_med ! 
tuned by alignment + * if(src&dst not both word aligned) { + * sm_movebytes: + * move byte by byte in 4-way unrolled loop + * fall into sm_left; + * sm_left: + * move 0-3 bytes byte at a time as needed. + * restore error handler and exit. + * + * } else { ! src&dst are word aligned + * check for at least 8 bytes left, + * move word at a time, unrolled by 2 + * when fewer than 8 bytes left, + * sm_half: move half word at a time while 2 or more bytes left + * sm_byte: move final byte if necessary + * sm_exit: + * restore error handler and exit. + * } + * + * ! Medium length cases with at least CHKSIZE bytes available + * ! method: line up src and dst as best possible, then + * ! move data in 4-way unrolled loops. + * + * sm_med: + * if(src&dst unalignable) + * go to sm_movebytes + * if(src&dst halfword alignable) + * go to sm_movehalf + * if(src&dst word alignable) + * go to sm_moveword + * ! fall into long word movement + * move bytes until src is word aligned + * if not long word aligned, move a word + * move long words in 4-way unrolled loop until < 32 bytes left + * move long words in 1-way unrolled loop until < 8 bytes left + * if zero bytes left, goto sm_exit + * if one byte left, go to sm_byte + * else go to sm_half + * + * sm_moveword: + * move bytes until src is word aligned + * move words in 4-way unrolled loop until < 16 bytes left + * move words in 1-way unrolled loop until < 4 bytes left + * if zero bytes left, goto sm_exit + * if one byte left, go to sm_byte + * else go to sm_half + * + * sm_movehalf: + * move a byte if needed to align src on halfword + * move halfwords in 4-way unrolled loop until < 8 bytes left + * if zero bytes left, goto sm_exit + * if one byte left, go to sm_byte + * else go to sm_half + * + * + * FPBLK_copy: + * %l6 = curthread->t_lofault; + * if (%l6 != NULL) { + * membar #Sync + * curthread->t_lofault = .copyerr; + * caller_error_handler = TRUE ! %l6 |= 2 + * } + * + * ! for FPU testing we must not migrate cpus + * if (curthread->t_lwp == NULL) { + * ! Kernel threads do not have pcb's in which to store + * ! the floating point state, so disallow preemption during + * ! the copy. This also prevents cpu migration. + * kpreempt_disable(curthread); + * } else { + * thread_nomigrate(); + * } + * + * old_fprs = %fprs; + * old_gsr = %gsr; + * if (%fprs.fef) { + * %fprs.fef = 1; + * save current fpregs on stack using blockstore + * } else { + * %fprs.fef = 1; + * } + * + * + * do_blockcopy_here; + * + * In lofault handler: + * curthread->t_lofault = .copyerr2; + * Continue on with the normal exit handler + * + * On normal exit: + * %gsr = old_gsr; + * if (old_fprs & FPRS_FEF) + * restore fpregs from stack using blockload + * else + * zero fpregs + * %fprs = old_fprs; + * membar #Sync + * curthread->t_lofault = (%l6 & ~3); + * ! following test omitted from copyin/copyout as they + * ! will always have a current thread + * if (curthread->t_lwp == NULL) + * kpreempt_enable(curthread); + * else + * thread_allowmigrate(); + * return (0) + * + * In second lofault handler (.copyerr2): + * We've tried to restore fp state from the stack and failed. To + * prevent from returning with a corrupted fp state, we will panic. + */ + +/* + * Comments about optimization choices + * + * The initial optimization decision in this code is to determine + * whether to use the FP registers for a copy or not. If we don't + * use the FP registers, we can execute the copy as a leaf routine, + * saving a register save and restore. 
Also, less elaborate setup + * is required, allowing short copies to be completed more quickly. + * For longer copies, especially unaligned ones (where the src and + * dst do not align to allow simple ldx,stx operation), the FP + * registers allow much faster copy operations. + * + * The estimated extra cost of the FP path will vary depending on + * src/dst alignment, dst offset from the next 64 byte FPblock store + * boundary, remaining src data after the last full dst cache line is + * moved whether the FP registers need to be saved, and some other + * minor issues. The average additional overhead is estimated to be + * 400 clocks. Since each non-repeated/predicted tst and branch costs + * around 10 clocks, elaborate calculation would slow down to all + * longer copies and only benefit a small portion of medium sized + * copies. Rather than incur such cost, we chose fixed transition + * points for each of the alignment choices. + * + * For the inner loop, here is a comparison of the per cache line + * costs for each alignment when src&dst are in cache: + * + * byte aligned: 108 clocks slower for non-FPBLK + * half aligned: 44 clocks slower for non-FPBLK + * word aligned: 12 clocks slower for non-FPBLK + * long aligned: 4 clocks >>faster<< for non-FPBLK + * + * The long aligned loop runs faster because it does no prefetching. + * That wins if the data is not in cache or there is too little + * data to gain much benefit from prefetching. But when there + * is more data and that data is not in cache, failing to prefetch + * can run much slower. In addition, there is a 2 Kbyte store queue + * which will cause the non-FPBLK inner loop to slow for larger copies. + * The exact tradeoff is strongly load and application dependent, with + * increasing risk of a customer visible performance regression if the + * non-FPBLK code is used for larger copies. Studies of synthetic in-cache + * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe + * upper limit for the non-FPBLK code. To minimize performance regression + * risk while still gaining the primary benefits of the improvements to + * the non-FPBLK code, we set an upper bound of 1024 bytes for the various + * hw_copy_limit_*. Later experimental studies using different values + * of hw_copy_limit_* can be used to make further adjustments if + * appropriate. + * + * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned + * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned + * hw_copy_limit_4 = src and dst are word aligned but not longword aligned + * hw_copy_limit_8 = src and dst are longword aligned + * + * To say that src and dst are word aligned means that after + * some initial alignment activity of moving 0 to 3 bytes, + * both the src and dst will be on word boundaries so that + * word loads and stores may be used. + * + * Recommended initial values as of Mar 2004, includes testing + * on Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar(1050MHz): + * hw_copy_limit_1 = 256 + * hw_copy_limit_2 = 512 + * hw_copy_limit_4 = 1024 + * hw_copy_limit_8 = 1024 (or 1536 on some systems) + * + * + * If hw_copy_limit_? is set to zero, then use of FPBLK copy is + * disabled for that alignment choice. + * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256) + * the value of VIS_COPY_THRESHOLD is used. + * It is not envisioned that hw_copy_limit_? 
will be changed in the field + * It is provided to allow for disabling FPBLK copies and to allow + * easy testing of alternate values on future HW implementations + * that might have different cache sizes, clock rates or instruction + * timing rules. + * + * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum + * threshold to speedup all shorter copies (less than 256). That + * saves an alignment test, memory reference, and enabling test + * for all short copies, or an estimated 24 clocks. + * + * The order in which these limits are checked does matter since each + * non-predicted tst and branch costs around 10 clocks. + * If src and dst are randomly selected addresses, + * 4 of 8 will not be alignable. + * 2 of 8 will be half word alignable. + * 1 of 8 will be word alignable. + * 1 of 8 will be long word alignable. + * But, tests on running kernels show that src and dst to copy code + * are typically not on random alignments. Structure copies and + * copies of larger data sizes are often on long word boundaries. + * So we test the long word alignment case first, then + * the byte alignment, then halfword, then word alignment. + * + * Several times, tests for length are made to split the code + * into subcases. These tests often allow later tests to be + * avoided. For example, within the non-FPBLK copy, we first + * check for tiny copies of 3 bytes or less. That allows us + * to use a 4-way unrolled loop for the general byte copy case + * without a test on loop entry. + * We subdivide the non-FPBLK case further into CHKSIZE bytes and less + * vs longer cases. For the really short case, we don't attempt + * align src and dst. We try to minimize special case tests in + * the shortest loops as each test adds a significant percentage + * to the total time. + * + * For the medium sized cases, we allow ourselves to adjust the + * src and dst alignment and provide special cases for each of + * the four adjusted alignment cases. The CHKSIZE that was used + * to decide between short and medium size was chosen to be 39 + * as that allows for the worst case of 7 bytes of alignment + * shift and 4 times 8 bytes for the first long word unrolling. + * That knowledge saves an initial test for length on entry into + * the medium cases. If the general loop unrolling factor were + * to be increases, this number would also need to be adjusted. + * + * For all cases in the non-FPBLK code where it is known that at + * least 4 chunks of data are available for movement, the + * loop is unrolled by four. This 4-way loop runs in 8 clocks + * or 2 clocks per data element. Due to limitations of the + * branch instruction on Cheetah, Jaguar, and Panther, the + * minimum time for a small, tight loop is 3 clocks. So + * the 4-way loop runs 50% faster than the fastest non-unrolled + * loop. + * + * Instruction alignment is forced by used of .align 16 directives + * and nops which are not executed in the code. This + * combination of operations shifts the alignment of following + * loops to insure that loops are aligned so that their instructions + * fall within the minimum number of 4 instruction fetch groups. + * If instructions are inserted or removed between the .align + * instruction and the unrolled loops, then the alignment needs + * to be readjusted. Misaligned loops can add a clock per loop + * iteration to the loop timing. + * + * In a few cases, code is duplicated to avoid a branch. 
Since + * a non-predicted tst and branch takes 10 clocks, this savings + * is judged an appropriate time-space tradeoff. + * + * Within the FPBLK-code, the prefetch method in the inner + * loop needs to be explained as it is not standard. Two + * prefetches are issued for each cache line instead of one. + * The primary one is at the maximum reach of 8 cache lines. + * Most of the time, that maximum prefetch reach gives the + * cache line more time to reach the processor for systems with + * higher processor clocks. But, sometimes memory interference + * can cause that prefetch to be dropped. Putting a second + * prefetch at a reach of 5 cache lines catches the drops + * three iterations later and shows a measured improvement + * in performance over any similar loop with a single prefetch. + * The prefetches are placed in the loop so they overlap with + * non-memory instructions, so that there is no extra cost + * when the data is already in-cache. + * + */ + +/* + * Notes on preserving existing fp state and on membars. + * + * When a copyOP decides to use fp we may have to preserve existing + * floating point state. It is not the caller's state that we need to + * preserve - the rest of the kernel does not use fp and, anyway, fp + * registers are volatile across a call. Some examples: + * + * - userland has fp state and is interrupted (device interrupt + * or trap) and within the interrupt/trap handling we use + * bcopy() + * - another (higher level) interrupt or trap handler uses bcopy + * while a bcopy from an earlier interrupt is still active + * - an asynchronous error trap occurs while fp state exists (in + * userland or in kernel copy) and the tl0 component of the handling + * uses bcopy + * - a user process with fp state incurs a copy-on-write fault and + * hwblkpagecopy always uses fp + * + * We therefore need a per-call place in which to preserve fp state - + * using our stack is ideal (and since fp copy cannot be leaf optimized + * because of calls it makes, this is no hardship). + * + * The following membar BLD/BST discussion is Cheetah pipeline specific. + * In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are + * nops (those semantics always apply) and #StoreLoad is implemented + * as a membar #Sync. + * + * It is possible that the owner of the fp state has a block load or + * block store still "in flight" at the time we come to preserve that + * state. Block loads are blocking in Cheetah pipelines so we do not + * need to sync with them. In preserving fp regs we will use block stores + * (which are not blocking in Cheetah pipelines) so we require a membar #Sync + * after storing state (so that our subsequent use of those registers + * does not modify them before the block stores complete); this membar + * also serves to sync with block stores the owner of the fp state has + * initiated. + * + * When we have finished fp copy (with it's repeated block stores) + * we must membar #Sync so that our block stores may complete before + * we either restore the original fp state into the fp registers or + * return to a caller which may initiate other fp operations that could + * modify the fp regs we used before the block stores complete. + * + * Synchronous faults (eg, unresolvable DMMU miss) that occur while + * t_lofault is not NULL will not panic but will instead trampoline + * to the registered lofault handler. There is no need for any + * membars for these - eg, our store to t_lofault will always be visible to + * ourselves and it is our cpu which will take any trap. 
+ * + * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur + * while t_lofault is not NULL will also not panic. Since we're copying + * to or from userland the extent of the damage is known - the destination + * buffer is incomplete. So trap handlers will trampoline to the lofault + * handler in this case which should take some form of error action to + * avoid using the incomplete buffer. The trap handler also flags the + * fault so that later return-from-trap handling (for the trap that brought + * this thread into the kernel in the first place) can notify the process + * and reboot the system (or restart the service with Greenline/Contracts). + * + * Asynchronous faults (eg, uncorrectable ECC error from memory) can + * result in deferred error traps - the trap is taken sometime after + * the event and the trap PC may not be the PC of the faulting access. + * Delivery of such pending traps can be forced by a membar #Sync, acting + * as an "error barrier" in this role. To accurately apply the user/kernel + * separation described in the preceding paragraph we must force delivery + * of deferred traps affecting kernel state before we install a lofault + * handler (if we interpose a new lofault handler on an existing one there + * is no need to repeat this), and we must force delivery of deferred + * errors affecting the lofault-protected region before we clear t_lofault. + * Failure to do so results in lost kernel state being interpreted as + * affecting a copyin/copyout only, or of an error that really only + * affects copy data being interpreted as losing kernel state. + * + * Since the copy operations may preserve and later restore floating + * point state that does not belong to the caller (see examples above), + * we must be careful in how we do this in order to prevent corruption + * of another program. + * + * To make sure that floating point state is always saved and restored + * correctly, the following "big rules" must be followed when the floating + * point registers will be used: + * + * 1. %l6 always holds the caller's lofault handler. Also in this register, + * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in + * use. Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a + * lofault handler was set coming in. + * + * 2. The FPUSED flag indicates that all FP state has been successfully stored + * on the stack. It should not be set until this save has been completed. + * + * 3. The FPUSED flag should not be cleared on exit until all FP state has + * been restored from the stack. If an error occurs while restoring + * data from the stack, the error handler can check this flag to see if + * a restore is necessary. + * + * 4. Code run under the new lofault handler must be kept to a minimum. In + * particular, any calls to FP_ALLOWMIGRATE, which could result in a call + * to kpreempt(), should not be made until after the lofault handler has + * been restored. + */ + +/* + * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed + * to "break even" using FP/VIS-accelerated memory operations. + * The FPBLK code assumes a minimum number of bytes are available + * to be moved on entry. Check that code carefully before + * reducing VIS_COPY_THRESHOLD below 256. + */ +/* + * This shadows sys/machsystm.h which can't be included due to the lack of + * _ASM guards in include files it references. Change it here, change it there. 
+ */ +#define VIS_COPY_THRESHOLD 256 + +/* + * TEST for very short copies + * Be aware that the maximum unroll for the short unaligned case + * is SHORTCOPY+1 + */ +#define SHORTCOPY 3 +#define CHKSIZE 39 + +/* + * Indicates that we're to trampoline to the error handler. + * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag. + * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag. + */ +#define FPUSED_FLAG 1 +#define TRAMP_FLAG 2 +#define MASK_FLAGS 3 + +/* + * Number of outstanding prefetches. + * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with + * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a + * reach of 5*BLOCK_SIZE. The double prefetch gives an typical improvement + * of 5% for large copies as compared to a single prefetch. The reason + * for the improvement is that with Cheetah and Jaguar, some prefetches + * are dropped due to the prefetch queue being full. The second prefetch + * reduces the number of cache lines that are dropped. + * Do not remove the double prefetch or change either CHEETAH_PREFETCH + * or CHEETAH_2ND_PREFETCH without extensive performance tests to prove + * there is no loss of performance. + */ +#define CHEETAH_PREFETCH 8 +#define CHEETAH_2ND_PREFETCH 5 + +#define VIS_BLOCKSIZE 64 + +/* + * Size of stack frame in order to accomodate a 64-byte aligned + * floating-point register save area and 2 64-bit temp locations. + * All copy functions use two quadrants of fp registers; to assure a + * block-aligned two block buffer in which to save we must reserve + * three blocks on stack. Not all functions preserve %pfrs on stack + * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all. + * + * _______________________________________ <-- %fp + STACK_BIAS + * | We may need to preserve 2 quadrants | + * | of fp regs, but since we do so with | + * | BST/BLD we need room in which to | + * | align to VIS_BLOCKSIZE bytes. So | + * | this area is 3 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET + * |-------------------------------------| + * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET + * |-------------------------------------| + * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET + * --------------------------------------- + */ +#define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8)) +#define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 3) +#define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 2) - 1) +#define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8) +#define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8) + +/* + * Common macros used by the various versions of the block copy + * routines in this file. + */ + +/* + * In FP copies if we do not have preserved data to restore over + * the fp regs we used then we must zero those regs to avoid + * exposing portions of the data to later threads (data security). + * + * Copy functions use either quadrants 1 and 3 or 2 and 4. + * + * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47 + * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63 + * + * The instructions below are quicker than repeated fzero instructions + * since they can dispatch down two fp pipelines. 
+ */ +#define FZEROQ1Q3 \ + fzero %f0 ;\ + fzero %f2 ;\ + faddd %f0, %f2, %f4 ;\ + fmuld %f0, %f2, %f6 ;\ + faddd %f0, %f2, %f8 ;\ + fmuld %f0, %f2, %f10 ;\ + faddd %f0, %f2, %f12 ;\ + fmuld %f0, %f2, %f14 ;\ + faddd %f0, %f2, %f32 ;\ + fmuld %f0, %f2, %f34 ;\ + faddd %f0, %f2, %f36 ;\ + fmuld %f0, %f2, %f38 ;\ + faddd %f0, %f2, %f40 ;\ + fmuld %f0, %f2, %f42 ;\ + faddd %f0, %f2, %f44 ;\ + fmuld %f0, %f2, %f46 + +#define FZEROQ2Q4 \ + fzero %f16 ;\ + fzero %f18 ;\ + faddd %f16, %f18, %f20 ;\ + fmuld %f16, %f18, %f22 ;\ + faddd %f16, %f18, %f24 ;\ + fmuld %f16, %f18, %f26 ;\ + faddd %f16, %f18, %f28 ;\ + fmuld %f16, %f18, %f30 ;\ + faddd %f16, %f18, %f48 ;\ + fmuld %f16, %f18, %f50 ;\ + faddd %f16, %f18, %f52 ;\ + fmuld %f16, %f18, %f54 ;\ + faddd %f16, %f18, %f56 ;\ + fmuld %f16, %f18, %f58 ;\ + faddd %f16, %f18, %f60 ;\ + fmuld %f16, %f18, %f62 + +/* + * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack. + * Used to save and restore in-use fp registers when we want to use FP + * and find fp already in use and copy size still large enough to justify + * the additional overhead of this save and restore. + * + * A membar #Sync is needed before save to sync fp ops initiated before + * the call to the copy function (by whoever has fp in use); for example + * an earlier block load to the quadrant we are about to save may still be + * "in flight". A membar #Sync is required at the end of the save to + * sync our block store (the copy code is about to begin ldd's to the + * first quadrant). Note, however, that since Cheetah pipeline block load + * is blocking we can omit the initial membar before saving fp state (they're + * commented below in case of future porting to a chip that does not block + * on block load). + * + * Similarly: a membar #Sync before restore allows the block stores of + * the copy operation to complete before we fill the quadrants with their + * original data, and a membar #Sync after restore lets the block loads + * of the restore complete before we return to whoever has the fp regs + * in use. To avoid repeated membar #Sync we make it the responsibility + * of the copy code to membar #Sync immediately after copy is complete + * and before using the BLD_*_FROMSTACK macro. + */ +#if !defined(lint) +#define BST_FPQ1Q3_TOSTACK(tmp1) \ + /* membar #Sync */ ;\ + add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ + and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ + stda %f0, [tmp1]ASI_BLK_P ;\ + add tmp1, VIS_BLOCKSIZE, tmp1 ;\ + stda %f32, [tmp1]ASI_BLK_P ;\ + membar #Sync + +#define BLD_FPQ1Q3_FROMSTACK(tmp1) \ + /* membar #Sync - provided at copy completion */ ;\ + add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ + and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ + ldda [tmp1]ASI_BLK_P, %f0 ;\ + add tmp1, VIS_BLOCKSIZE, tmp1 ;\ + ldda [tmp1]ASI_BLK_P, %f32 ;\ + membar #Sync + +#define BST_FPQ2Q4_TOSTACK(tmp1) \ + /* membar #Sync */ ;\ + add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ + and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ + stda %f16, [tmp1]ASI_BLK_P ;\ + add tmp1, VIS_BLOCKSIZE, tmp1 ;\ + stda %f48, [tmp1]ASI_BLK_P ;\ + membar #Sync + +#define BLD_FPQ2Q4_FROMSTACK(tmp1) \ + /* membar #Sync - provided at copy completion */ ;\ + add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ + and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ + ldda [tmp1]ASI_BLK_P, %f16 ;\ + add tmp1, VIS_BLOCKSIZE, tmp1 ;\ + ldda [tmp1]ASI_BLK_P, %f48 ;\ + membar #Sync +#endif + +/* + * FP_NOMIGRATE and FP_ALLOWMIGRATE. 
Prevent migration (or, stronger, + * prevent preemption if there is no t_lwp to save FP state to on context + * switch) before commencing a FP copy, and reallow it on completion or + * in error trampoline paths when we were using FP copy. + * + * Both macros may call other functions, so be aware that all outputs are + * forfeit after using these macros. For this reason we do not pass registers + * to use - we just use any outputs we want. + * + * For fpRAS we need to perform the fpRAS mechanism test on the same + * CPU as we use for the copy operation, both so that we validate the + * CPU we perform the copy on and so that we know which CPU failed + * if a failure is detected. Hence we need to be bound to "our" CPU. + * This could be achieved through disabling preemption (and we have do it that + * way for threads with no t_lwp) but for larger copies this may hold + * higher priority threads off of cpu for too long (eg, realtime). So we + * make use of the lightweight t_nomigrate mechanism where we can (ie, when + * we have a t_lwp). + * + * Pseudo code: + * + * FP_NOMIGRATE: + * + * if (curthread->t_lwp) { + * thread_nomigrate(); + * } else { + * kpreempt_disable(); + * } + * + * FP_ALLOWMIGRATE: + * + * if (curthread->t_lwp) { + * thread_allowmigrate(); + * } else { + * kpreempt_enable(); + * } + */ + +#define FP_NOMIGRATE(label1, label2) \ + ldn [THREAD_REG + T_LWP], %o0 ;\ + brz,a,pn %o0, label1/**/f ;\ + ldsb [THREAD_REG + T_PREEMPT], %o1 ;\ + call thread_nomigrate ;\ + nop ;\ + ba label2/**/f ;\ + nop ;\ +label1: ;\ + inc %o1 ;\ + stb %o1, [THREAD_REG + T_PREEMPT] ;\ +label2: + +#define FP_ALLOWMIGRATE(label1, label2) \ + ldn [THREAD_REG + T_LWP], %o0 ;\ + brz,a,pn %o0, label1/**/f ;\ + ldsb [THREAD_REG + T_PREEMPT], %o1 ;\ + call thread_allowmigrate ;\ + nop ;\ + ba label2/**/f ;\ + nop ;\ +label1: ;\ + dec %o1 ;\ + brnz,pn %o1, label2/**/f ;\ + stb %o1, [THREAD_REG + T_PREEMPT] ;\ + ldn [THREAD_REG + T_CPU], %o0 ;\ + ldub [%o0 + CPU_KPRUNRUN], %o0 ;\ + brz,pt %o0, label2/**/f ;\ + nop ;\ + call kpreempt ;\ + rdpr %pil, %o0 ;\ +label2: + +/* + * Copy a block of storage, returning an error code if `from' or + * `to' takes a kernel pagefault which cannot be resolved. + * Returns errno value on pagefault error, 0 if all ok + */ + +#if defined(lint) + +/* ARGSUSED */ +int +kcopy(const void *from, void *to, size_t count) +{ return(0); } + +#else /* lint */ + + .seg ".text" + .align 4 + + ENTRY(kcopy) + + cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case + bleu,pt %ncc, .kcopy_small ! go to larger cases + xor %o0, %o1, %o3 ! are src, dst alignable? + btst 7, %o3 ! + bz,pt %ncc, .kcopy_8 ! check for longword alignment + nop + btst 1, %o3 ! + bz,pt %ncc, .kcopy_2 ! check for half-word + nop + sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_1)], %o3 + tst %o3 + bz,pn %icc, .kcopy_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .kcopy_small ! go to small copy + nop + ba,pt %ncc, .kcopy_more ! otherwise go to large copy + nop +.kcopy_2: + btst 3, %o3 ! + bz,pt %ncc, .kcopy_4 ! check for word alignment + nop + sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_2)], %o3 + tst %o3 + bz,pn %icc, .kcopy_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .kcopy_small ! go to small copy + nop + ba,pt %ncc, .kcopy_more ! otherwise go to large copy + nop +.kcopy_4: + ! already checked longword, must be word aligned + sethi %hi(hw_copy_limit_4), %o3 ! 
Check copy limit + ld [%o3 + %lo(hw_copy_limit_4)], %o3 + tst %o3 + bz,pn %icc, .kcopy_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .kcopy_small ! go to small copy + nop + ba,pt %ncc, .kcopy_more ! otherwise go to large copy + nop +.kcopy_8: + sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_8)], %o3 + tst %o3 + bz,pn %icc, .kcopy_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .kcopy_small ! go to small copy + nop + ba,pt %ncc, .kcopy_more ! otherwise go to large copy + nop + +.kcopy_small: + sethi %hi(.sm_copyerr), %o5 ! sm_copyerr is lofault value + or %o5, %lo(.sm_copyerr), %o5 + ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler + membar #Sync ! sync error barrier + ba,pt %ncc, .sm_do_copy ! common code + stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault + +.kcopy_more: + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + sethi %hi(.copyerr), %l7 ! copyerr is lofault value + or %l7, %lo(.copyerr), %l7 + ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler + membar #Sync ! sync error barrier + ba,pt %ncc, .do_copy ! common code + stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault + + +/* + * We got here because of a fault during bcopy_more, called from kcopy or bcopy. + * Errno value is in %g1. bcopy_more uses fp quadrants 1 and 3. + */ +.copyerr: + set .copyerr2, %l0 + membar #Sync ! sync error barrier + stn %l0, [THREAD_REG + T_LOFAULT] ! set t_lofault + btst FPUSED_FLAG, %l6 + bz %ncc, 1f + and %l6, TRAMP_FLAG, %l0 ! copy trampoline flag to %l0 + + ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr + wr %o2, 0, %gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz,pt %icc, 4f + nop + + BLD_FPQ1Q3_FROMSTACK(%o2) + + ba,pt %ncc, 1f + wr %o3, 0, %fprs ! restore fprs + +4: + FZEROQ1Q3 + wr %o3, 0, %fprs ! restore fprs + + ! + ! Need to cater for the different expectations of kcopy + ! and bcopy. kcopy will *always* set a t_lofault handler + ! If it fires, we're expected to just return the error code + ! and *not* to invoke any existing error handler. As far as + ! bcopy is concerned, we only set t_lofault if there was an + ! existing lofault handler. In that case we're expected to + ! invoke the previously existing handler after resetting the + ! t_lofault value. + ! +1: + andn %l6, MASK_FLAGS, %l6 ! turn trampoline flag off + membar #Sync ! sync error barrier + stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + FP_ALLOWMIGRATE(5, 6) + + btst TRAMP_FLAG, %l0 + bnz,pn %ncc, 3f + nop + ret + restore %g1, 0, %o0 + +3: + ! + ! We're here via bcopy. There *must* have been an error handler + ! in place otherwise we would have died a nasty death already. + ! + jmp %l6 ! goto real handler + restore %g0, 0, %o0 ! dispose of copy window + +/* + * We got here because of a fault in .copyerr. We can't safely restore fp + * state, so we panic. + */ +fp_panic_msg: + .asciz "Unable to restore fp state after copy operation" + + .align 4 +.copyerr2: + set fp_panic_msg, %o0 + call panic + nop + +/* + * We got here because of a fault during a small kcopy or bcopy. + * No floating point registers are used by the small copies. + * Errno value is in %g1. + */ +.sm_copyerr: +1: + btst TRAMP_FLAG, %o4 + membar #Sync + andn %o4, TRAMP_FLAG, %o4 + bnz,pn %ncc, 3f + stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g1, %o0 +3: + jmp %o4 ! goto real handler + mov %g0, %o0 ! 
+ + SET_SIZE(kcopy) +#endif /* lint */ + + +/* + * Copy a block of storage - must not overlap (from + len <= to). + * Registers: l6 - saved t_lofault + * (for short copies, o4 - saved t_lofault) + * + * Copy a page of memory. + * Assumes double word alignment and a count >= 256. + */ +#if defined(lint) + +/* ARGSUSED */ +void +bcopy(const void *from, void *to, size_t count) +{} + +#else /* lint */ + + ENTRY(bcopy) + + cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case + bleu,pt %ncc, .bcopy_small ! go to larger cases + xor %o0, %o1, %o3 ! are src, dst alignable? + btst 7, %o3 ! + bz,pt %ncc, .bcopy_8 ! check for longword alignment + nop + btst 1, %o3 ! + bz,pt %ncc, .bcopy_2 ! check for half-word + nop + sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_1)], %o3 + tst %o3 + bz,pn %icc, .bcopy_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .bcopy_small ! go to small copy + nop + ba,pt %ncc, .bcopy_more ! otherwise go to large copy + nop +.bcopy_2: + btst 3, %o3 ! + bz,pt %ncc, .bcopy_4 ! check for word alignment + nop + sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_2)], %o3 + tst %o3 + bz,pn %icc, .bcopy_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .bcopy_small ! go to small copy + nop + ba,pt %ncc, .bcopy_more ! otherwise go to large copy + nop +.bcopy_4: + ! already checked longword, must be word aligned + sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_4)], %o3 + tst %o3 + bz,pn %icc, .bcopy_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .bcopy_small ! go to small copy + nop + ba,pt %ncc, .bcopy_more ! otherwise go to large copy + nop +.bcopy_8: + sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_8)], %o3 + tst %o3 + bz,pn %icc, .bcopy_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .bcopy_small ! go to small copy + nop + ba,pt %ncc, .bcopy_more ! otherwise go to large copy + nop + + .align 16 +.bcopy_small: + ldn [THREAD_REG + T_LOFAULT], %o4 ! save t_lofault + tst %o4 + bz,pt %icc, .sm_do_copy + nop + sethi %hi(.sm_copyerr), %o5 + or %o5, %lo(.sm_copyerr), %o5 + membar #Sync ! sync error barrier + stn %o5, [THREAD_REG + T_LOFAULT] ! install new vector + or %o4, TRAMP_FLAG, %o4 ! error should trampoline +.sm_do_copy: + cmp %o2, SHORTCOPY ! check for really short case + bleu,pt %ncc, .bc_sm_left ! + cmp %o2, CHKSIZE ! check for medium length cases + bgu,pn %ncc, .bc_med ! + or %o0, %o1, %o3 ! prepare alignment check + andcc %o3, 0x3, %g0 ! test for alignment + bz,pt %ncc, .bc_sm_word ! branch to word aligned case +.bc_sm_movebytes: + sub %o2, 3, %o2 ! adjust count to allow cc zero test +.bc_sm_notalign4: + ldub [%o0], %o3 ! read byte + stb %o3, [%o1] ! write byte + subcc %o2, 4, %o2 ! reduce count by 4 + ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes + add %o0, 4, %o0 ! advance SRC by 4 + stb %o3, [%o1 + 1] + ldub [%o0 - 2], %o3 + add %o1, 4, %o1 ! advance DST by 4 + stb %o3, [%o1 - 2] + ldub [%o0 - 1], %o3 + bgt,pt %ncc, .bc_sm_notalign4 ! loop til 3 or fewer bytes remain + stb %o3, [%o1 - 1] + add %o2, 3, %o2 ! restore count +.bc_sm_left: + tst %o2 + bz,pt %ncc, .bc_sm_exit ! check for zero length + deccc %o2 ! reduce count for cc test + ldub [%o0], %o3 ! move one byte + bz,pt %ncc, .bc_sm_exit + stb %o3, [%o1] + ldub [%o0 + 1], %o3 ! move another byte + deccc %o2 ! 
check for more + bz,pt %ncc, .bc_sm_exit + stb %o3, [%o1 + 1] + ldub [%o0 + 2], %o3 ! move final byte + stb %o3, [%o1 + 2] + membar #Sync ! sync error barrier + andn %o4, TRAMP_FLAG, %o4 + stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g0, %o0 ! return 0 + .align 16 + nop ! instruction alignment + ! see discussion at start of file +.bc_sm_words: + lduw [%o0], %o3 ! read word +.bc_sm_wordx: + subcc %o2, 8, %o2 ! update count + stw %o3, [%o1] ! write word + add %o0, 8, %o0 ! update SRC + lduw [%o0 - 4], %o3 ! read word + add %o1, 8, %o1 ! update DST + bgt,pt %ncc, .bc_sm_words ! loop til done + stw %o3, [%o1 - 4] ! write word + addcc %o2, 7, %o2 ! restore count + bz,pt %ncc, .bc_sm_exit + deccc %o2 + bz,pt %ncc, .bc_sm_byte +.bc_sm_half: + subcc %o2, 2, %o2 ! reduce count by 2 + add %o0, 2, %o0 ! advance SRC by 2 + lduh [%o0 - 2], %o3 ! read half word + add %o1, 2, %o1 ! advance DST by 2 + bgt,pt %ncc, .bc_sm_half ! loop til done + sth %o3, [%o1 - 2] ! write half word + addcc %o2, 1, %o2 ! restore count + bz,pt %ncc, .bc_sm_exit + nop +.bc_sm_byte: + ldub [%o0], %o3 + stb %o3, [%o1] + membar #Sync ! sync error barrier + andn %o4, TRAMP_FLAG, %o4 + stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g0, %o0 ! return 0 + +.bc_sm_word: + subcc %o2, 4, %o2 ! update count + bgt,pt %ncc, .bc_sm_wordx + lduw [%o0], %o3 ! read word + addcc %o2, 3, %o2 ! restore count + bz,pt %ncc, .bc_sm_exit + stw %o3, [%o1] ! write word + deccc %o2 ! reduce count for cc test + ldub [%o0 + 4], %o3 ! load one byte + bz,pt %ncc, .bc_sm_exit + stb %o3, [%o1 + 4] ! store one byte + ldub [%o0 + 5], %o3 ! load second byte + deccc %o2 + bz,pt %ncc, .bc_sm_exit + stb %o3, [%o1 + 5] ! store second byte + ldub [%o0 + 6], %o3 ! load third byte + stb %o3, [%o1 + 6] ! store third byte +.bc_sm_exit: + membar #Sync ! sync error barrier + andn %o4, TRAMP_FLAG, %o4 + stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g0, %o0 ! return 0 + + .align 16 +.bc_med: + xor %o0, %o1, %o3 ! setup alignment check + btst 1, %o3 + bnz,pt %ncc, .bc_sm_movebytes ! unaligned + nop + btst 3, %o3 + bnz,pt %ncc, .bc_med_half ! halfword aligned + nop + btst 7, %o3 + bnz,pt %ncc, .bc_med_word ! word aligned + nop +.bc_med_long: + btst 3, %o0 ! check for + bz,pt %ncc, .bc_med_long1 ! word alignment + nop +.bc_med_long0: + ldub [%o0], %o3 ! load one byte + inc %o0 + stb %o3,[%o1] ! store byte + inc %o1 + btst 3, %o0 + bnz,pt %ncc, .bc_med_long0 + dec %o2 +.bc_med_long1: ! word aligned + btst 7, %o0 ! check for long word + bz,pt %ncc, .bc_med_long2 + nop + lduw [%o0], %o3 ! load word + add %o0, 4, %o0 ! advance SRC by 4 + stw %o3, [%o1] ! store word + add %o1, 4, %o1 ! advance DST by 4 + sub %o2, 4, %o2 ! reduce count by 4 +! +! Now long word aligned and have at least 32 bytes to move +! +.bc_med_long2: + sub %o2, 31, %o2 ! adjust count to allow cc zero test +.bc_med_lmove: + ldx [%o0], %o3 ! read long word + stx %o3, [%o1] ! write long word + subcc %o2, 32, %o2 ! reduce count by 32 + ldx [%o0 + 8], %o3 ! repeat for a total for 4 long words + add %o0, 32, %o0 ! advance SRC by 32 + stx %o3, [%o1 + 8] + ldx [%o0 - 16], %o3 + add %o1, 32, %o1 ! advance DST by 32 + stx %o3, [%o1 - 16] + ldx [%o0 - 8], %o3 + bgt,pt %ncc, .bc_med_lmove ! loop til 31 or fewer bytes left + stx %o3, [%o1 - 8] + addcc %o2, 24, %o2 ! restore count to long word offset + ble,pt %ncc, .bc_med_lextra ! check for more long words to move + nop +.bc_med_lword: + ldx [%o0], %o3 ! read long word + subcc %o2, 8, %o2 ! 
reduce count by 8 + stx %o3, [%o1] ! write long word + add %o0, 8, %o0 ! advance SRC by 8 + bgt,pt %ncc, .bc_med_lword ! loop til 7 or fewer bytes left + add %o1, 8, %o1 ! advance DST by 8 +.bc_med_lextra: + addcc %o2, 7, %o2 ! restore rest of count + bz,pt %ncc, .bc_sm_exit ! if zero, then done + deccc %o2 + bz,pt %ncc, .bc_sm_byte + nop + ba,pt %ncc, .bc_sm_half + nop + + .align 16 +.bc_med_word: + btst 3, %o0 ! check for + bz,pt %ncc, .bc_med_word1 ! word alignment + nop +.bc_med_word0: + ldub [%o0], %o3 ! load one byte + inc %o0 + stb %o3,[%o1] ! store byte + inc %o1 + btst 3, %o0 + bnz,pt %ncc, .bc_med_word0 + dec %o2 +! +! Now word aligned and have at least 36 bytes to move +! +.bc_med_word1: + sub %o2, 15, %o2 ! adjust count to allow cc zero test +.bc_med_wmove: + lduw [%o0], %o3 ! read word + stw %o3, [%o1] ! write word + subcc %o2, 16, %o2 ! reduce count by 16 + lduw [%o0 + 4], %o3 ! repeat for a total for 4 words + add %o0, 16, %o0 ! advance SRC by 16 + stw %o3, [%o1 + 4] + lduw [%o0 - 8], %o3 + add %o1, 16, %o1 ! advance DST by 16 + stw %o3, [%o1 - 8] + lduw [%o0 - 4], %o3 + bgt,pt %ncc, .bc_med_wmove ! loop til 15 or fewer bytes left + stw %o3, [%o1 - 4] + addcc %o2, 12, %o2 ! restore count to word offset + ble,pt %ncc, .bc_med_wextra ! check for more words to move + nop +.bc_med_word2: + lduw [%o0], %o3 ! read word + subcc %o2, 4, %o2 ! reduce count by 4 + stw %o3, [%o1] ! write word + add %o0, 4, %o0 ! advance SRC by 4 + bgt,pt %ncc, .bc_med_word2 ! loop til 3 or fewer bytes left + add %o1, 4, %o1 ! advance DST by 4 +.bc_med_wextra: + addcc %o2, 3, %o2 ! restore rest of count + bz,pt %ncc, .bc_sm_exit ! if zero, then done + deccc %o2 + bz,pt %ncc, .bc_sm_byte + nop + ba,pt %ncc, .bc_sm_half + nop + + .align 16 +.bc_med_half: + btst 1, %o0 ! check for + bz,pt %ncc, .bc_med_half1 ! half word alignment + nop + ldub [%o0], %o3 ! load one byte + inc %o0 + stb %o3,[%o1] ! store byte + inc %o1 + dec %o2 +! +! Now half word aligned and have at least 38 bytes to move +! +.bc_med_half1: + sub %o2, 7, %o2 ! adjust count to allow cc zero test +.bc_med_hmove: + lduh [%o0], %o3 ! read half word + sth %o3, [%o1] ! write half word + subcc %o2, 8, %o2 ! reduce count by 8 + lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords + add %o0, 8, %o0 ! advance SRC by 8 + sth %o3, [%o1 + 2] + lduh [%o0 - 4], %o3 + add %o1, 8, %o1 ! advance DST by 8 + sth %o3, [%o1 - 4] + lduh [%o0 - 2], %o3 + bgt,pt %ncc, .bc_med_hmove ! loop til 7 or fewer bytes left + sth %o3, [%o1 - 2] + addcc %o2, 7, %o2 ! restore count + bz,pt %ncc, .bc_sm_exit + deccc %o2 + bz,pt %ncc, .bc_sm_byte + nop + ba,pt %ncc, .bc_sm_half + nop + + SET_SIZE(bcopy) + +/* + * The _more entry points are not intended to be used directly by + * any caller from outside this file. They are provided to allow + * profiling and dtrace of the portions of the copy code that uses + * the floating point registers. + * This entry is particularly important as DTRACE (at least as of + * 4/2004) does not support leaf functions. + */ + + ENTRY(bcopy_more) +.bcopy_more: + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault + tst %l6 + bz,pt %ncc, .do_copy + nop + sethi %hi(.copyerr), %o2 + or %o2, %lo(.copyerr), %o2 + membar #Sync ! sync error barrier + stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector + ! + ! We've already captured whether t_lofault was zero on entry. + ! We need to mark ourselves as being from bcopy since both + ! kcopy and bcopy use the same code path. If TRAMP_FLAG is set + ! 
and the saved lofault was zero, we won't reset lofault on + ! returning. + ! + or %l6, TRAMP_FLAG, %l6 + +/* + * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes + * Also, use of FP registers has been tested to be enabled + */ +.do_copy: + FP_NOMIGRATE(6, 7) + + rd %fprs, %o2 ! check for unused fp + st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs + btst FPRS_FEF, %o2 + bz,a,pt %icc, .do_blockcopy + wr %g0, FPRS_FEF, %fprs + + BST_FPQ1Q3_TOSTACK(%o2) + +.do_blockcopy: + rd %gsr, %o2 + stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr + or %l6, FPUSED_FLAG, %l6 + +#define REALSRC %i0 +#define DST %i1 +#define CNT %i2 +#define SRC %i3 +#define TMP %i5 + + andcc DST, VIS_BLOCKSIZE - 1, TMP + bz,pt %ncc, 2f + neg TMP + add TMP, VIS_BLOCKSIZE, TMP + + ! TMP = bytes required to align DST on FP_BLOCK boundary + ! Using SRC as a tmp here + cmp TMP, 3 + bleu,pt %ncc, 1f + sub CNT,TMP,CNT ! adjust main count + sub TMP, 3, TMP ! adjust for end of loop test +.bc_blkalign: + ldub [REALSRC], SRC ! move 4 bytes per loop iteration + stb SRC, [DST] + subcc TMP, 4, TMP + ldub [REALSRC + 1], SRC + add REALSRC, 4, REALSRC + stb SRC, [DST + 1] + ldub [REALSRC - 2], SRC + add DST, 4, DST + stb SRC, [DST - 2] + ldub [REALSRC - 1], SRC + bgu,pt %ncc, .bc_blkalign + stb SRC, [DST - 1] + + addcc TMP, 3, TMP ! restore count adjustment + bz,pt %ncc, 2f ! no bytes left? + nop +1: ldub [REALSRC], SRC + inc REALSRC + inc DST + deccc TMP + bgu %ncc, 1b + stb SRC, [DST - 1] + +2: + andn REALSRC, 0x7, SRC + alignaddr REALSRC, %g0, %g0 + + ! SRC - 8-byte aligned + ! DST - 64-byte aligned + prefetch [SRC], #one_read + prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read + prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read + prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read + ldd [SRC], %f0 +#if CHEETAH_PREFETCH > 4 + prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read +#endif + ldd [SRC + 0x08], %f2 +#if CHEETAH_PREFETCH > 5 + prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read +#endif + ldd [SRC + 0x10], %f4 +#if CHEETAH_PREFETCH > 6 + prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read +#endif + faligndata %f0, %f2, %f32 + ldd [SRC + 0x18], %f6 +#if CHEETAH_PREFETCH > 7 + prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read +#endif + faligndata %f2, %f4, %f34 + ldd [SRC + 0x20], %f8 + faligndata %f4, %f6, %f36 + ldd [SRC + 0x28], %f10 + faligndata %f6, %f8, %f38 + ldd [SRC + 0x30], %f12 + faligndata %f8, %f10, %f40 + ldd [SRC + 0x38], %f14 + faligndata %f10, %f12, %f42 + ldd [SRC + VIS_BLOCKSIZE], %f0 + sub CNT, VIS_BLOCKSIZE, CNT + add SRC, VIS_BLOCKSIZE, SRC + add REALSRC, VIS_BLOCKSIZE, REALSRC + ba,a,pt %ncc, 1f + nop + .align 16 +1: + ldd [SRC + 0x08], %f2 + faligndata %f12, %f14, %f44 + ldd [SRC + 0x10], %f4 + faligndata %f14, %f0, %f46 + stda %f32, [DST]ASI_BLK_P + ldd [SRC + 0x18], %f6 + faligndata %f0, %f2, %f32 + ldd [SRC + 0x20], %f8 + faligndata %f2, %f4, %f34 + ldd [SRC + 0x28], %f10 + faligndata %f4, %f6, %f36 + ldd [SRC + 0x30], %f12 + faligndata %f6, %f8, %f38 + ldd [SRC + 0x38], %f14 + faligndata %f8, %f10, %f40 + sub CNT, VIS_BLOCKSIZE, CNT + ldd [SRC + VIS_BLOCKSIZE], %f0 + faligndata %f10, %f12, %f42 + prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read + add DST, VIS_BLOCKSIZE, DST + prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read + add REALSRC, VIS_BLOCKSIZE, REALSRC + cmp CNT, VIS_BLOCKSIZE + 8 + bgu,pt %ncc, 1b + add SRC, VIS_BLOCKSIZE, SRC + + ! 
only if REALSRC & 0x7 is 0 + cmp CNT, VIS_BLOCKSIZE + bne %ncc, 3f + andcc REALSRC, 0x7, %g0 + bz,pt %ncc, 2f + nop +3: + faligndata %f12, %f14, %f44 + faligndata %f14, %f0, %f46 + stda %f32, [DST]ASI_BLK_P + add DST, VIS_BLOCKSIZE, DST + ba,pt %ncc, 3f + nop +2: + ldd [SRC + 0x08], %f2 + fsrc1 %f12, %f44 + ldd [SRC + 0x10], %f4 + fsrc1 %f14, %f46 + stda %f32, [DST]ASI_BLK_P + ldd [SRC + 0x18], %f6 + fsrc1 %f0, %f32 + ldd [SRC + 0x20], %f8 + fsrc1 %f2, %f34 + ldd [SRC + 0x28], %f10 + fsrc1 %f4, %f36 + ldd [SRC + 0x30], %f12 + fsrc1 %f6, %f38 + ldd [SRC + 0x38], %f14 + fsrc1 %f8, %f40 + sub CNT, VIS_BLOCKSIZE, CNT + add DST, VIS_BLOCKSIZE, DST + add SRC, VIS_BLOCKSIZE, SRC + add REALSRC, VIS_BLOCKSIZE, REALSRC + fsrc1 %f10, %f42 + fsrc1 %f12, %f44 + fsrc1 %f14, %f46 + stda %f32, [DST]ASI_BLK_P + add DST, VIS_BLOCKSIZE, DST + ba,a,pt %ncc, .bcb_exit + nop + +3: tst CNT + bz,a,pt %ncc, .bcb_exit + nop + +5: ldub [REALSRC], TMP + inc REALSRC + inc DST + deccc CNT + bgu %ncc, 5b + stb TMP, [DST - 1] +.bcb_exit: + membar #Sync + + FPRAS_INTERVAL(FPRAS_BCOPY, 0, %l5, %o2, %o3, %o4, %o5, 8) + FPRAS_REWRITE_TYPE2Q1(0, %l5, %o2, %o3, 8, 9) + FPRAS_CHECK(FPRAS_BCOPY, %l5, 9) ! outputs lost + + ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr + wr %o2, 0, %gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz,pt %icc, 4f + nop + + BLD_FPQ1Q3_FROMSTACK(%o2) + + ba,pt %ncc, 2f + wr %o3, 0, %fprs ! restore fprs +4: + FZEROQ1Q3 + wr %o3, 0, %fprs ! restore fprs +2: + membar #Sync ! sync error barrier + andn %l6, MASK_FLAGS, %l6 + stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + FP_ALLOWMIGRATE(5, 6) + ret + restore %g0, 0, %o0 + + SET_SIZE(bcopy_more) + +#endif /* lint */ + +/* + * Block copy with possibly overlapped operands. + */ + +#if defined(lint) + +/*ARGSUSED*/ +void +ovbcopy(const void *from, void *to, size_t count) +{} + +#else /* lint */ + + ENTRY(ovbcopy) + tst %o2 ! check count + bgu,a %ncc, 1f ! nothing to do or bad arguments + subcc %o0, %o1, %o3 ! difference of from and to address + + retl ! return + nop +1: + bneg,a %ncc, 2f + neg %o3 ! if < 0, make it positive +2: cmp %o2, %o3 ! cmp size and abs(from - to) + bleu %ncc, bcopy ! if size <= abs(diff): use bcopy, + .empty ! no overlap + cmp %o0, %o1 ! compare from and to addresses + blu %ncc, .ov_bkwd ! if from < to, copy backwards + nop + ! + ! Copy forwards. + ! +.ov_fwd: + ldub [%o0], %o3 ! read from address + inc %o0 ! inc from address + stb %o3, [%o1] ! write to address + deccc %o2 ! dec count + bgu %ncc, .ov_fwd ! loop till done + inc %o1 ! inc to address + + retl ! return + nop + ! + ! Copy backwards. + ! +.ov_bkwd: + deccc %o2 ! dec count + ldub [%o0 + %o2], %o3 ! get byte at end of src + bgu %ncc, .ov_bkwd ! loop till done + stb %o3, [%o1 + %o2] ! delay slot, store at end of dst + + retl ! return + nop + + SET_SIZE(ovbcopy) + +#endif /* lint */ + + +/* + * hwblkpagecopy() + * + * Copies exactly one page. This routine assumes the caller (ppcopy) + * has already disabled kernel preemption and has checked + * use_hw_bcopy. Preventing preemption also prevents cpu migration. + */ +#ifdef lint +/*ARGSUSED*/ +void +hwblkpagecopy(const void *src, void *dst) +{ } +#else /* lint */ + ENTRY(hwblkpagecopy) + ! get another window w/space for three aligned blocks of saved fpregs + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + + ! %i0 - source address (arg) + ! %i1 - destination address (arg) + ! %i2 - length of region (not arg) + ! %l0 - saved fprs + ! 
%l1 - pointer to saved fpregs + + rd %fprs, %l0 ! check for unused fp + btst FPRS_FEF, %l0 + bz,a,pt %icc, 1f + wr %g0, FPRS_FEF, %fprs + + BST_FPQ1Q3_TOSTACK(%l1) + +1: set PAGESIZE, CNT + mov REALSRC, SRC + + prefetch [SRC], #one_read + prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read + prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read + prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read + ldd [SRC], %f0 +#if CHEETAH_PREFETCH > 4 + prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read +#endif + ldd [SRC + 0x08], %f2 +#if CHEETAH_PREFETCH > 5 + prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read +#endif + ldd [SRC + 0x10], %f4 +#if CHEETAH_PREFETCH > 6 + prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read +#endif + fsrc1 %f0, %f32 + ldd [SRC + 0x18], %f6 +#if CHEETAH_PREFETCH > 7 + prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read +#endif + fsrc1 %f2, %f34 + ldd [SRC + 0x20], %f8 + fsrc1 %f4, %f36 + ldd [SRC + 0x28], %f10 + fsrc1 %f6, %f38 + ldd [SRC + 0x30], %f12 + fsrc1 %f8, %f40 + ldd [SRC + 0x38], %f14 + fsrc1 %f10, %f42 + ldd [SRC + VIS_BLOCKSIZE], %f0 + sub CNT, VIS_BLOCKSIZE, CNT + add SRC, VIS_BLOCKSIZE, SRC + ba,a,pt %ncc, 2f + nop + .align 16 +2: + ldd [SRC + 0x08], %f2 + fsrc1 %f12, %f44 + ldd [SRC + 0x10], %f4 + fsrc1 %f14, %f46 + stda %f32, [DST]ASI_BLK_P + ldd [SRC + 0x18], %f6 + fsrc1 %f0, %f32 + ldd [SRC + 0x20], %f8 + fsrc1 %f2, %f34 + ldd [SRC + 0x28], %f10 + fsrc1 %f4, %f36 + ldd [SRC + 0x30], %f12 + fsrc1 %f6, %f38 + ldd [SRC + 0x38], %f14 + fsrc1 %f8, %f40 + ldd [SRC + VIS_BLOCKSIZE], %f0 + fsrc1 %f10, %f42 + prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read + sub CNT, VIS_BLOCKSIZE, CNT + add DST, VIS_BLOCKSIZE, DST + cmp CNT, VIS_BLOCKSIZE + 8 + prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read + bgu,pt %ncc, 2b + add SRC, VIS_BLOCKSIZE, SRC + + ! trailing block + ldd [SRC + 0x08], %f2 + fsrc1 %f12, %f44 + ldd [SRC + 0x10], %f4 + fsrc1 %f14, %f46 + stda %f32, [DST]ASI_BLK_P + ldd [SRC + 0x18], %f6 + fsrc1 %f0, %f32 + ldd [SRC + 0x20], %f8 + fsrc1 %f2, %f34 + ldd [SRC + 0x28], %f10 + fsrc1 %f4, %f36 + ldd [SRC + 0x30], %f12 + fsrc1 %f6, %f38 + ldd [SRC + 0x38], %f14 + fsrc1 %f8, %f40 + sub CNT, VIS_BLOCKSIZE, CNT + add DST, VIS_BLOCKSIZE, DST + add SRC, VIS_BLOCKSIZE, SRC + fsrc1 %f10, %f42 + fsrc1 %f12, %f44 + fsrc1 %f14, %f46 + stda %f32, [DST]ASI_BLK_P + + membar #Sync + + FPRAS_INTERVAL(FPRAS_PGCOPY, 1, %l5, %o2, %o3, %o4, %o5, 8) + FPRAS_REWRITE_TYPE1(1, %l5, %f32, %o2, 9) + FPRAS_CHECK(FPRAS_PGCOPY, %l5, 9) ! lose outputs + + btst FPRS_FEF, %l0 + bz,pt %icc, 2f + nop + + BLD_FPQ1Q3_FROMSTACK(%l3) + ba 3f + nop + +2: FZEROQ1Q3 + +3: wr %l0, 0, %fprs ! restore fprs + ret + restore %g0, 0, %o0 + + SET_SIZE(hwblkpagecopy) +#endif /* lint */ + + +/* + * Transfer data to and from user space - + * Note that these routines can cause faults + * It is assumed that the kernel has nothing at + * less than KERNELBASE in the virtual address space. + * + * Note that copyin(9F) and copyout(9F) are part of the + * DDI/DKI which specifies that they return '-1' on "errors." + * + * Sigh. + * + * So there's two extremely similar routines - xcopyin() and xcopyout() + * which return the errno that we've faithfully computed. This + * allows other callers (e.g. uiomove(9F)) to work correctly. + * Given that these are used pretty heavily, we expand the calling + * sequences inline for all flavours (rather than making wrappers). 
+ * + * There are also stub routines for xcopyout_little and xcopyin_little, + * which currently are intended to handle requests of <= 16 bytes from + * do_unaligned. Future enhancement to make them handle 8k pages efficiently + * is left as an exercise... + */ + +/* + * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) + * + * General theory of operation: + * + * The only difference between copy{in,out} and + * xcopy{in,out} is in the error handling routine they invoke + * when a memory access error occurs. xcopyOP returns the errno + * while copyOP returns -1 (see above). copy{in,out}_noerr set + * a special flag (by oring the TRAMP_FLAG into the fault handler address) + * if they are called with a fault handler already in place. That flag + * causes the default handlers to trampoline to the previous handler + * upon an error. + * + * None of the copyops routines grab a window until it's decided that + * we need to do a HW block copy operation. This saves a window + * spill/fill when we're called during socket ops. The typical IO + * path won't cause spill/fill traps. + * + * This code uses a set of 4 limits for the maximum size that will + * be copied given a particular input/output address alignment. + * If the value for a particular limit is zero, the copy will be performed + * by the plain copy loops rather than FPBLK. + * + * See the description of bcopy above for more details of the + * data copying algorithm and the default limits. + * + */ + +/* + * Copy kernel data to user space (copyout/xcopyout/xcopyout_little). + */ + +#if defined(lint) + + +#else /* lint */ +/* + * We save the arguments in the following registers in case of a fault: + * kaddr - %l1 + * uaddr - %l2 + * count - %l3 + */ +#define SAVE_SRC %l1 +#define SAVE_DST %l2 +#define SAVE_COUNT %l3 + +#define SM_SAVE_SRC %g4 +#define SM_SAVE_DST %g5 +#define SM_SAVE_COUNT %o5 +#define ERRNO %l5 + + +#define REAL_LOFAULT %l4 +/* + * Generic copyio fault handler. This is the first line of defense when a + * fault occurs in (x)copyin/(x)copyout. In order for this to function + * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT. + * This allows us to share common code for all the flavors of the copy + * operations, including the _noerr versions. + * + * Note that this function will restore the original input parameters before + * calling REAL_LOFAULT. So the real handler can vector to the appropriate + * member of the t_copyop structure, if needed. + */ + ENTRY(copyio_fault) + membar #Sync + mov %g1,ERRNO ! save errno in ERRNO + btst FPUSED_FLAG, %l6 + bz %ncc, 1f + nop + + ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 + wr %o2, 0, %gsr ! restore gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz,pt %icc, 4f + nop + + BLD_FPQ2Q4_FROMSTACK(%o2) + + ba,pt %ncc, 1f + wr %o3, 0, %fprs ! restore fprs + +4: + FZEROQ2Q4 + wr %o3, 0, %fprs ! restore fprs + +1: + andn %l6, FPUSED_FLAG, %l6 + membar #Sync + stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + FP_ALLOWMIGRATE(5, 6) + + mov SAVE_SRC, %i0 + mov SAVE_DST, %i1 + jmp REAL_LOFAULT + mov SAVE_COUNT, %i2 + + SET_SIZE(copyio_fault) + + +#endif + +#if defined(lint) + +/*ARGSUSED*/ +int +copyout(const void *kaddr, void *uaddr, size_t count) +{ return (0); } + +#else /* lint */ + + ENTRY(copyout) + + cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case + bleu,pt %ncc, .copyout_small ! go to larger cases + xor %o0, %o1, %o3 ! are src, dst alignable? + btst 7, %o3 ! + bz,pt %ncc, .copyout_8 ! 
check for longword alignment + nop + btst 1, %o3 ! + bz,pt %ncc, .copyout_2 ! check for half-word + nop + sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_1)], %o3 + tst %o3 + bz,pn %icc, .copyout_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyout_small ! go to small copy + nop + ba,pt %ncc, .copyout_more ! otherwise go to large copy + nop +.copyout_2: + btst 3, %o3 ! + bz,pt %ncc, .copyout_4 ! check for word alignment + nop + sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_2)], %o3 + tst %o3 + bz,pn %icc, .copyout_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyout_small ! go to small copy + nop + ba,pt %ncc, .copyout_more ! otherwise go to large copy + nop +.copyout_4: + ! already checked longword, must be word aligned + sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_4)], %o3 + tst %o3 + bz,pn %icc, .copyout_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyout_small ! go to small copy + nop + ba,pt %ncc, .copyout_more ! otherwise go to large copy + nop +.copyout_8: + sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_8)], %o3 + tst %o3 + bz,pn %icc, .copyout_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyout_small ! go to small copy + nop + ba,pt %ncc, .copyout_more ! otherwise go to large copy + nop + + .align 16 + nop ! instruction alignment + ! see discussion at start of file +.copyout_small: + sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault + or %o5, %lo(.sm_copyout_err), %o5 + ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler + membar #Sync ! sync error barrier + stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault +.sm_do_copyout: + mov %o0, SM_SAVE_SRC + mov %o1, SM_SAVE_DST + cmp %o2, SHORTCOPY ! check for really short case + bleu,pt %ncc, .co_sm_left ! + mov %o2, SM_SAVE_COUNT + cmp %o2, CHKSIZE ! check for medium length cases + bgu,pn %ncc, .co_med ! + or %o0, %o1, %o3 ! prepare alignment check + andcc %o3, 0x3, %g0 ! test for alignment + bz,pt %ncc, .co_sm_word ! branch to word aligned case +.co_sm_movebytes: + sub %o2, 3, %o2 ! adjust count to allow cc zero test +.co_sm_notalign4: + ldub [%o0], %o3 ! read byte + subcc %o2, 4, %o2 ! reduce count by 4 + stba %o3, [%o1]ASI_USER ! write byte + inc %o1 ! advance DST by 1 + ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes + add %o0, 4, %o0 ! advance SRC by 4 + stba %o3, [%o1]ASI_USER + inc %o1 ! advance DST by 1 + ldub [%o0 - 2], %o3 + stba %o3, [%o1]ASI_USER + inc %o1 ! advance DST by 1 + ldub [%o0 - 1], %o3 + stba %o3, [%o1]ASI_USER + bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain + inc %o1 ! advance DST by 1 + add %o2, 3, %o2 ! restore count +.co_sm_left: + tst %o2 + bz,pt %ncc, .co_sm_exit ! check for zero length + nop + ldub [%o0], %o3 ! load one byte + deccc %o2 ! reduce count for cc test + bz,pt %ncc, .co_sm_exit + stba %o3,[%o1]ASI_USER ! store one byte + ldub [%o0 + 1], %o3 ! load second byte + deccc %o2 + inc %o1 + bz,pt %ncc, .co_sm_exit + stba %o3,[%o1]ASI_USER ! store second byte + ldub [%o0 + 2], %o3 ! load third byte + inc %o1 + stba %o3,[%o1]ASI_USER ! store third byte + membar #Sync ! sync error barrier + stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g0, %o0 ! return 0 + .align 16 +.co_sm_words: + lduw [%o0], %o3 ! 
read word +.co_sm_wordx: + subcc %o2, 8, %o2 ! update count + stwa %o3, [%o1]ASI_USER ! write word + add %o0, 8, %o0 ! update SRC + lduw [%o0 - 4], %o3 ! read word + add %o1, 4, %o1 ! update DST + stwa %o3, [%o1]ASI_USER ! write word + bgt,pt %ncc, .co_sm_words ! loop til done + add %o1, 4, %o1 ! update DST + addcc %o2, 7, %o2 ! restore count + bz,pt %ncc, .co_sm_exit + nop + deccc %o2 + bz,pt %ncc, .co_sm_byte +.co_sm_half: + subcc %o2, 2, %o2 ! reduce count by 2 + lduh [%o0], %o3 ! read half word + add %o0, 2, %o0 ! advance SRC by 2 + stha %o3, [%o1]ASI_USER ! write half word + bgt,pt %ncc, .co_sm_half ! loop til done + add %o1, 2, %o1 ! advance DST by 2 + addcc %o2, 1, %o2 ! restore count + bz,pt %ncc, .co_sm_exit + nop +.co_sm_byte: + ldub [%o0], %o3 + stba %o3, [%o1]ASI_USER + membar #Sync ! sync error barrier + stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g0, %o0 ! return 0 + .align 16 +.co_sm_word: + subcc %o2, 4, %o2 ! update count + bgt,pt %ncc, .co_sm_wordx + lduw [%o0], %o3 ! read word + addcc %o2, 3, %o2 ! restore count + bz,pt %ncc, .co_sm_exit + stwa %o3, [%o1]ASI_USER ! write word + deccc %o2 ! reduce count for cc test + ldub [%o0 + 4], %o3 ! load one byte + add %o1, 4, %o1 + bz,pt %ncc, .co_sm_exit + stba %o3, [%o1]ASI_USER ! store one byte + ldub [%o0 + 5], %o3 ! load second byte + deccc %o2 + inc %o1 + bz,pt %ncc, .co_sm_exit + stba %o3, [%o1]ASI_USER ! store second byte + ldub [%o0 + 6], %o3 ! load third byte + inc %o1 + stba %o3, [%o1]ASI_USER ! store third byte +.co_sm_exit: + membar #Sync ! sync error barrier + stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g0, %o0 ! return 0 + + .align 16 +.co_med: + xor %o0, %o1, %o3 ! setup alignment check + btst 1, %o3 + bnz,pt %ncc, .co_sm_movebytes ! unaligned + nop + btst 3, %o3 + bnz,pt %ncc, .co_med_half ! halfword aligned + nop + btst 7, %o3 + bnz,pt %ncc, .co_med_word ! word aligned + nop +.co_med_long: + btst 3, %o0 ! check for + bz,pt %ncc, .co_med_long1 ! word alignment + nop +.co_med_long0: + ldub [%o0], %o3 ! load one byte + inc %o0 + stba %o3,[%o1]ASI_USER ! store byte + inc %o1 + btst 3, %o0 + bnz,pt %ncc, .co_med_long0 + dec %o2 +.co_med_long1: ! word aligned + btst 7, %o0 ! check for long word + bz,pt %ncc, .co_med_long2 + nop + lduw [%o0], %o3 ! load word + add %o0, 4, %o0 ! advance SRC by 4 + stwa %o3, [%o1]ASI_USER ! store word + add %o1, 4, %o1 ! advance DST by 4 + sub %o2, 4, %o2 ! reduce count by 4 +! +! Now long word aligned and have at least 32 bytes to move +! +.co_med_long2: + sub %o2, 31, %o2 ! adjust count to allow cc zero test + sub %o1, 8, %o1 ! adjust pointer to allow store in + ! branch delay slot instead of add +.co_med_lmove: + add %o1, 8, %o1 ! advance DST by 8 + ldx [%o0], %o3 ! read long word + subcc %o2, 32, %o2 ! reduce count by 32 + stxa %o3, [%o1]ASI_USER ! write long word + add %o1, 8, %o1 ! advance DST by 8 + ldx [%o0 + 8], %o3 ! repeat for a total for 4 long words + add %o0, 32, %o0 ! advance SRC by 32 + stxa %o3, [%o1]ASI_USER + ldx [%o0 - 16], %o3 + add %o1, 8, %o1 ! advance DST by 8 + stxa %o3, [%o1]ASI_USER + ldx [%o0 - 8], %o3 + add %o1, 8, %o1 ! advance DST by 8 + bgt,pt %ncc, .co_med_lmove ! loop til 31 or fewer bytes left + stxa %o3, [%o1]ASI_USER + add %o1, 8, %o1 ! advance DST by 8 + addcc %o2, 24, %o2 ! restore count to long word offset + ble,pt %ncc, .co_med_lextra ! check for more long words to move + nop +.co_med_lword: + ldx [%o0], %o3 ! read long word + subcc %o2, 8, %o2 ! reduce count by 8 + stxa %o3, [%o1]ASI_USER ! 
write long word + add %o0, 8, %o0 ! advance SRC by 8 + bgt,pt %ncc, .co_med_lword ! loop til 7 or fewer bytes left + add %o1, 8, %o1 ! advance DST by 8 +.co_med_lextra: + addcc %o2, 7, %o2 ! restore rest of count + bz,pt %ncc, .co_sm_exit ! if zero, then done + deccc %o2 + bz,pt %ncc, .co_sm_byte + nop + ba,pt %ncc, .co_sm_half + nop + + .align 16 + nop ! instruction alignment + ! see discussion at start of file +.co_med_word: + btst 3, %o0 ! check for + bz,pt %ncc, .co_med_word1 ! word alignment + nop +.co_med_word0: + ldub [%o0], %o3 ! load one byte + inc %o0 + stba %o3,[%o1]ASI_USER ! store byte + inc %o1 + btst 3, %o0 + bnz,pt %ncc, .co_med_word0 + dec %o2 +! +! Now word aligned and have at least 36 bytes to move +! +.co_med_word1: + sub %o2, 15, %o2 ! adjust count to allow cc zero test +.co_med_wmove: + lduw [%o0], %o3 ! read word + subcc %o2, 16, %o2 ! reduce count by 16 + stwa %o3, [%o1]ASI_USER ! write word + add %o1, 4, %o1 ! advance DST by 4 + lduw [%o0 + 4], %o3 ! repeat for a total for 4 words + add %o0, 16, %o0 ! advance SRC by 16 + stwa %o3, [%o1]ASI_USER + add %o1, 4, %o1 ! advance DST by 4 + lduw [%o0 - 8], %o3 + stwa %o3, [%o1]ASI_USER + add %o1, 4, %o1 ! advance DST by 4 + lduw [%o0 - 4], %o3 + stwa %o3, [%o1]ASI_USER + bgt,pt %ncc, .co_med_wmove ! loop til 15 or fewer bytes left + add %o1, 4, %o1 ! advance DST by 4 + addcc %o2, 12, %o2 ! restore count to word offset + ble,pt %ncc, .co_med_wextra ! check for more words to move + nop +.co_med_word2: + lduw [%o0], %o3 ! read word + subcc %o2, 4, %o2 ! reduce count by 4 + stwa %o3, [%o1]ASI_USER ! write word + add %o0, 4, %o0 ! advance SRC by 4 + bgt,pt %ncc, .co_med_word2 ! loop til 3 or fewer bytes left + add %o1, 4, %o1 ! advance DST by 4 +.co_med_wextra: + addcc %o2, 3, %o2 ! restore rest of count + bz,pt %ncc, .co_sm_exit ! if zero, then done + deccc %o2 + bz,pt %ncc, .co_sm_byte + nop + ba,pt %ncc, .co_sm_half + nop + + .align 16 + nop ! instruction alignment + nop ! see discussion at start of file + nop +.co_med_half: + btst 1, %o0 ! check for + bz,pt %ncc, .co_med_half1 ! half word alignment + nop + ldub [%o0], %o3 ! load one byte + inc %o0 + stba %o3,[%o1]ASI_USER ! store byte + inc %o1 + dec %o2 +! +! Now half word aligned and have at least 38 bytes to move +! +.co_med_half1: + sub %o2, 7, %o2 ! adjust count to allow cc zero test +.co_med_hmove: + lduh [%o0], %o3 ! read half word + subcc %o2, 8, %o2 ! reduce count by 8 + stha %o3, [%o1]ASI_USER ! write half word + add %o1, 2, %o1 ! advance DST by 2 + lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords + add %o0, 8, %o0 ! advance SRC by 8 + stha %o3, [%o1]ASI_USER + add %o1, 2, %o1 ! advance DST by 2 + lduh [%o0 - 4], %o3 + stha %o3, [%o1]ASI_USER + add %o1, 2, %o1 ! advance DST by 2 + lduh [%o0 - 2], %o3 + stha %o3, [%o1]ASI_USER + bgt,pt %ncc, .co_med_hmove ! loop til 7 or fewer bytes left + add %o1, 2, %o1 ! advance DST by 2 + addcc %o2, 7, %o2 ! restore count + bz,pt %ncc, .co_sm_exit + deccc %o2 + bz,pt %ncc, .co_sm_byte + nop + ba,pt %ncc, .co_sm_half + nop + +/* + * We got here because of a fault during short copyout. + * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). + */ +.sm_copyout_err: + membar #Sync + stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + mov SM_SAVE_SRC, %o0 + mov SM_SAVE_DST, %o1 + mov SM_SAVE_COUNT, %o2 + ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler + tst %o3 + bz,pt %ncc, 3f ! if not, return error + nop + ldn [%o3 + CP_COPYOUT], %o5 ! if handler, invoke it with + jmp %o5 ! 
original arguments + nop +3: + retl + or %g0, -1, %o0 ! return error value + + SET_SIZE(copyout) + +/* + * The _more entry points are not intended to be used directly by + * any caller from outside this file. They are provided to allow + * profiling and dtrace of the portions of the copy code that uses + * the floating point registers. + * This entry is particularly important as DTRACE (at least as of + * 4/2004) does not support leaf functions. + */ + + ENTRY(copyout_more) +.copyout_more: + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + set .copyout_err, REAL_LOFAULT + +/* + * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes + */ +.do_copyout: + set copyio_fault, %l7 ! .copyio_fault is lofault val + + ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler + membar #Sync ! sync error barrier + stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault + + mov %i0, SAVE_SRC + mov %i1, SAVE_DST + mov %i2, SAVE_COUNT + + FP_NOMIGRATE(6, 7) + + rd %fprs, %o2 ! check for unused fp + st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs + btst FPRS_FEF, %o2 + bz,a,pt %icc, .do_blockcopyout + wr %g0, FPRS_FEF, %fprs + + BST_FPQ2Q4_TOSTACK(%o2) + +.do_blockcopyout: + rd %gsr, %o2 + stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr + or %l6, FPUSED_FLAG, %l6 + + andcc DST, VIS_BLOCKSIZE - 1, TMP + mov ASI_USER, %asi + bz,pt %ncc, 2f + neg TMP + add TMP, VIS_BLOCKSIZE, TMP + + ! TMP = bytes required to align DST on FP_BLOCK boundary + ! Using SRC as a tmp here + cmp TMP, 3 + bleu,pt %ncc, 1f + sub CNT,TMP,CNT ! adjust main count + sub TMP, 3, TMP ! adjust for end of loop test +.co_blkalign: + ldub [REALSRC], SRC ! move 4 bytes per loop iteration + stba SRC, [DST]%asi + subcc TMP, 4, TMP + ldub [REALSRC + 1], SRC + add REALSRC, 4, REALSRC + stba SRC, [DST + 1]%asi + ldub [REALSRC - 2], SRC + add DST, 4, DST + stba SRC, [DST - 2]%asi + ldub [REALSRC - 1], SRC + bgu,pt %ncc, .co_blkalign + stba SRC, [DST - 1]%asi + + addcc TMP, 3, TMP ! restore count adjustment + bz,pt %ncc, 2f ! no bytes left? + nop +1: ldub [REALSRC], SRC + inc REALSRC + inc DST + deccc TMP + bgu %ncc, 1b + stba SRC, [DST - 1]%asi + +2: + andn REALSRC, 0x7, SRC + alignaddr REALSRC, %g0, %g0 + + ! SRC - 8-byte aligned + ! 
DST - 64-byte aligned + prefetch [SRC], #one_read + prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read + prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read + prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read + ldd [SRC], %f16 +#if CHEETAH_PREFETCH > 4 + prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read +#endif + ldd [SRC + 0x08], %f18 +#if CHEETAH_PREFETCH > 5 + prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read +#endif + ldd [SRC + 0x10], %f20 +#if CHEETAH_PREFETCH > 6 + prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read +#endif + faligndata %f16, %f18, %f48 + ldd [SRC + 0x18], %f22 +#if CHEETAH_PREFETCH > 7 + prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read +#endif + faligndata %f18, %f20, %f50 + ldd [SRC + 0x20], %f24 + faligndata %f20, %f22, %f52 + ldd [SRC + 0x28], %f26 + faligndata %f22, %f24, %f54 + ldd [SRC + 0x30], %f28 + faligndata %f24, %f26, %f56 + ldd [SRC + 0x38], %f30 + faligndata %f26, %f28, %f58 + ldd [SRC + VIS_BLOCKSIZE], %f16 + sub CNT, VIS_BLOCKSIZE, CNT + add SRC, VIS_BLOCKSIZE, SRC + add REALSRC, VIS_BLOCKSIZE, REALSRC + ba,a,pt %ncc, 1f + nop + .align 16 +1: + ldd [SRC + 0x08], %f18 + faligndata %f28, %f30, %f60 + ldd [SRC + 0x10], %f20 + faligndata %f30, %f16, %f62 + stda %f48, [DST]ASI_BLK_AIUS + ldd [SRC + 0x18], %f22 + faligndata %f16, %f18, %f48 + ldd [SRC + 0x20], %f24 + faligndata %f18, %f20, %f50 + ldd [SRC + 0x28], %f26 + faligndata %f20, %f22, %f52 + ldd [SRC + 0x30], %f28 + faligndata %f22, %f24, %f54 + ldd [SRC + 0x38], %f30 + faligndata %f24, %f26, %f56 + sub CNT, VIS_BLOCKSIZE, CNT + ldd [SRC + VIS_BLOCKSIZE], %f16 + faligndata %f26, %f28, %f58 + prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read + add DST, VIS_BLOCKSIZE, DST + prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read + add REALSRC, VIS_BLOCKSIZE, REALSRC + cmp CNT, VIS_BLOCKSIZE + 8 + bgu,pt %ncc, 1b + add SRC, VIS_BLOCKSIZE, SRC + + ! only if REALSRC & 0x7 is 0 + cmp CNT, VIS_BLOCKSIZE + bne %ncc, 3f + andcc REALSRC, 0x7, %g0 + bz,pt %ncc, 2f + nop +3: + faligndata %f28, %f30, %f60 + faligndata %f30, %f16, %f62 + stda %f48, [DST]ASI_BLK_AIUS + add DST, VIS_BLOCKSIZE, DST + ba,pt %ncc, 3f + nop +2: + ldd [SRC + 0x08], %f18 + fsrc1 %f28, %f60 + ldd [SRC + 0x10], %f20 + fsrc1 %f30, %f62 + stda %f48, [DST]ASI_BLK_AIUS + ldd [SRC + 0x18], %f22 + fsrc1 %f16, %f48 + ldd [SRC + 0x20], %f24 + fsrc1 %f18, %f50 + ldd [SRC + 0x28], %f26 + fsrc1 %f20, %f52 + ldd [SRC + 0x30], %f28 + fsrc1 %f22, %f54 + ldd [SRC + 0x38], %f30 + fsrc1 %f24, %f56 + sub CNT, VIS_BLOCKSIZE, CNT + add DST, VIS_BLOCKSIZE, DST + add SRC, VIS_BLOCKSIZE, SRC + add REALSRC, VIS_BLOCKSIZE, REALSRC + fsrc1 %f26, %f58 + fsrc1 %f28, %f60 + fsrc1 %f30, %f62 + stda %f48, [DST]ASI_BLK_AIUS + add DST, VIS_BLOCKSIZE, DST + ba,a,pt %ncc, 4f + nop + +3: tst CNT + bz,a %ncc, 4f + nop + +5: ldub [REALSRC], TMP + inc REALSRC + inc DST + deccc CNT + bgu %ncc, 5b + stba TMP, [DST - 1]%asi +4: + +.copyout_exit: + membar #Sync + + FPRAS_INTERVAL(FPRAS_COPYOUT, 0, %l5, %o2, %o3, %o4, %o5, 8) + FPRAS_REWRITE_TYPE2Q2(0, %l5, %o2, %o3, 8, 9) + FPRAS_CHECK(FPRAS_COPYOUT, %l5, 9) ! lose outputs + + ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 + wr %o2, 0, %gsr ! restore gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz,pt %icc, 4f + nop + + BLD_FPQ2Q4_FROMSTACK(%o2) + + ba,pt %ncc, 1f + wr %o3, 0, %fprs ! restore fprs + +4: + FZEROQ2Q4 + wr %o3, 0, %fprs ! restore fprs + +1: + membar #Sync + andn %l6, FPUSED_FLAG, %l6 + stn %l6, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault + FP_ALLOWMIGRATE(5, 6) + ret + restore %g0, 0, %o0 + +/* + * We got here because of a fault during copyout. + * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). + */ +.copyout_err: + ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler + tst %o4 + bz,pt %ncc, 2f ! if not, return error + nop + ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with + jmp %g2 ! original arguments + restore %g0, 0, %g0 ! dispose of copy window +2: + ret + restore %g0, -1, %o0 ! return error value + + + SET_SIZE(copyout_more) + +#endif /* lint */ + + +#ifdef lint + +/*ARGSUSED*/ +int +xcopyout(const void *kaddr, void *uaddr, size_t count) +{ return (0); } + +#else /* lint */ + + ENTRY(xcopyout) + cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case + bleu,pt %ncc, .xcopyout_small ! go to larger cases + xor %o0, %o1, %o3 ! are src, dst alignable? + btst 7, %o3 ! + bz,pt %ncc, .xcopyout_8 ! + nop + btst 1, %o3 ! + bz,pt %ncc, .xcopyout_2 ! check for half-word + nop + sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_1)], %o3 + tst %o3 + bz,pn %icc, .xcopyout_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .xcopyout_small ! go to small copy + nop + ba,pt %ncc, .xcopyout_more ! otherwise go to large copy + nop +.xcopyout_2: + btst 3, %o3 ! + bz,pt %ncc, .xcopyout_4 ! check for word alignment + nop + sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_2)], %o3 + tst %o3 + bz,pn %icc, .xcopyout_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .xcopyout_small ! go to small copy + nop + ba,pt %ncc, .xcopyout_more ! otherwise go to large copy + nop +.xcopyout_4: + ! already checked longword, must be word aligned + sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_4)], %o3 + tst %o3 + bz,pn %icc, .xcopyout_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .xcopyout_small ! go to small copy + nop + ba,pt %ncc, .xcopyout_more ! otherwise go to large copy + nop +.xcopyout_8: + sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_8)], %o3 + tst %o3 + bz,pn %icc, .xcopyout_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .xcopyout_small ! go to small copy + nop + ba,pt %ncc, .xcopyout_more ! otherwise go to large copy + nop + +.xcopyout_small: + sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault + or %o5, %lo(.sm_xcopyout_err), %o5 + ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler + membar #Sync ! sync error barrier + ba,pt %ncc, .sm_do_copyout ! common code + stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault + +.xcopyout_more: + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + sethi %hi(.xcopyout_err), REAL_LOFAULT + ba,pt %ncc, .do_copyout ! common code + or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT + +/* + * We got here because of fault during xcopyout + * Errno value is in ERRNO + */ +.xcopyout_err: + ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler + tst %o4 + bz,pt %ncc, 2f ! if not, return error + nop + ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with + jmp %g2 ! original arguments + restore %g0, 0, %g0 ! dispose of copy window +2: + ret + restore ERRNO, 0, %o0 ! return errno value + +.sm_xcopyout_err: + + membar #Sync + stn %o4, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault + mov SM_SAVE_SRC, %o0 + mov SM_SAVE_DST, %o1 + mov SM_SAVE_COUNT, %o2 + ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler + tst %o3 + bz,pt %ncc, 3f ! if not, return error + nop + ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with + jmp %o5 ! original arguments + nop +3: + retl + or %g1, 0, %o0 ! return errno value + + SET_SIZE(xcopyout) + +#endif /* lint */ + +#ifdef lint + +/*ARGSUSED*/ +int +xcopyout_little(const void *kaddr, void *uaddr, size_t count) +{ return (0); } + +#else /* lint */ + + ENTRY(xcopyout_little) + sethi %hi(.xcopyio_err), %o5 + or %o5, %lo(.xcopyio_err), %o5 + ldn [THREAD_REG + T_LOFAULT], %o4 + membar #Sync ! sync error barrier + stn %o5, [THREAD_REG + T_LOFAULT] + mov %o4, %o5 + + subcc %g0, %o2, %o3 + add %o0, %o2, %o0 + bz,pn %ncc, 2f ! check for zero bytes + sub %o2, 1, %o4 + add %o0, %o4, %o0 ! start w/last byte + add %o1, %o2, %o1 + ldub [%o0 + %o3], %o4 + +1: stba %o4, [%o1 + %o3]ASI_AIUSL + inccc %o3 + sub %o0, 2, %o0 ! get next byte + bcc,a,pt %ncc, 1b + ldub [%o0 + %o3], %o4 + +2: + membar #Sync ! sync error barrier + stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g0, %o0 ! return (0) + + SET_SIZE(xcopyout_little) + +#endif /* lint */ + +/* + * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) + */ + +#if defined(lint) + +/*ARGSUSED*/ +int +copyin(const void *uaddr, void *kaddr, size_t count) +{ return (0); } + +#else /* lint */ + + ENTRY(copyin) + cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case + bleu,pt %ncc, .copyin_small ! go to larger cases + xor %o0, %o1, %o3 ! are src, dst alignable? + btst 7, %o3 ! + bz,pt %ncc, .copyin_8 ! check for longword alignment + nop + btst 1, %o3 ! + bz,pt %ncc, .copyin_2 ! check for half-word + nop + sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_1)], %o3 + tst %o3 + bz,pn %icc, .copyin_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyin_small ! go to small copy + nop + ba,pt %ncc, .copyin_more ! otherwise go to large copy + nop +.copyin_2: + btst 3, %o3 ! + bz,pt %ncc, .copyin_4 ! check for word alignment + nop + sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_2)], %o3 + tst %o3 + bz,pn %icc, .copyin_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyin_small ! go to small copy + nop + ba,pt %ncc, .copyin_more ! otherwise go to large copy + nop +.copyin_4: + ! already checked longword, must be word aligned + sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_4)], %o3 + tst %o3 + bz,pn %icc, .copyin_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyin_small ! go to small copy + nop + ba,pt %ncc, .copyin_more ! otherwise go to large copy + nop +.copyin_8: + sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_8)], %o3 + tst %o3 + bz,pn %icc, .copyin_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyin_small ! go to small copy + nop + ba,pt %ncc, .copyin_more ! otherwise go to large copy + nop + + .align 16 + nop ! instruction alignment + ! see discussion at start of file +.copyin_small: + sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault + or %o5, %lo(.sm_copyin_err), %o5 + ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp + membar #Sync ! 
sync error barrier + stn %o5, [THREAD_REG + T_LOFAULT] +.sm_do_copyin: + mov %o0, SM_SAVE_SRC + mov %o1, SM_SAVE_DST + cmp %o2, SHORTCOPY ! check for really short case + bleu,pt %ncc, .ci_sm_left ! + mov %o2, SM_SAVE_COUNT + cmp %o2, CHKSIZE ! check for medium length cases + bgu,pn %ncc, .ci_med ! + or %o0, %o1, %o3 ! prepare alignment check + andcc %o3, 0x3, %g0 ! test for alignment + bz,pt %ncc, .ci_sm_word ! branch to word aligned case +.ci_sm_movebytes: + sub %o2, 3, %o2 ! adjust count to allow cc zero test +.ci_sm_notalign4: + lduba [%o0]ASI_USER, %o3 ! read byte + subcc %o2, 4, %o2 ! reduce count by 4 + stb %o3, [%o1] ! write byte + add %o0, 1, %o0 ! advance SRC by 1 + lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes + add %o0, 1, %o0 ! advance SRC by 1 + stb %o3, [%o1 + 1] + add %o1, 4, %o1 ! advance DST by 4 + lduba [%o0]ASI_USER, %o3 + add %o0, 1, %o0 ! advance SRC by 1 + stb %o3, [%o1 - 2] + lduba [%o0]ASI_USER, %o3 + add %o0, 1, %o0 ! advance SRC by 1 + bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain + stb %o3, [%o1 - 1] + add %o2, 3, %o2 ! restore count +.ci_sm_left: + tst %o2 + bz,pt %ncc, .ci_sm_exit + nop + lduba [%o0]ASI_USER, %o3 ! load one byte + deccc %o2 ! reduce count for cc test + bz,pt %ncc, .ci_sm_exit + stb %o3,[%o1] ! store one byte + inc %o0 + lduba [%o0]ASI_USER, %o3 ! load second byte + deccc %o2 + bz,pt %ncc, .ci_sm_exit + stb %o3,[%o1 + 1] ! store second byte + inc %o0 + lduba [%o0]ASI_USER, %o3 ! load third byte + stb %o3,[%o1 + 2] ! store third byte + membar #Sync ! sync error barrier + stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g0, %o0 ! return 0 + .align 16 +.ci_sm_words: + lduwa [%o0]ASI_USER, %o3 ! read word +.ci_sm_wordx: + subcc %o2, 8, %o2 ! update count + stw %o3, [%o1] ! write word + add %o0, 4, %o0 ! update SRC + add %o1, 8, %o1 ! update DST + lduwa [%o0]ASI_USER, %o3 ! read word + add %o0, 4, %o0 ! update SRC + bgt,pt %ncc, .ci_sm_words ! loop til done + stw %o3, [%o1 - 4] ! write word + addcc %o2, 7, %o2 ! restore count + bz,pt %ncc, .ci_sm_exit + nop + deccc %o2 + bz,pt %ncc, .ci_sm_byte +.ci_sm_half: + subcc %o2, 2, %o2 ! reduce count by 2 + lduha [%o0]ASI_USER, %o3 ! read half word + add %o0, 2, %o0 ! advance SRC by 2 + add %o1, 2, %o1 ! advance DST by 2 + bgt,pt %ncc, .ci_sm_half ! loop til done + sth %o3, [%o1 - 2] ! write half word + addcc %o2, 1, %o2 ! restore count + bz,pt %ncc, .ci_sm_exit + nop +.ci_sm_byte: + lduba [%o0]ASI_USER, %o3 + stb %o3, [%o1] + membar #Sync ! sync error barrier + stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g0, %o0 ! return 0 + .align 16 +.ci_sm_word: + subcc %o2, 4, %o2 ! update count + bgt,pt %ncc, .ci_sm_wordx + lduwa [%o0]ASI_USER, %o3 ! read word + addcc %o2, 3, %o2 ! restore count + bz,pt %ncc, .ci_sm_exit + stw %o3, [%o1] ! write word + deccc %o2 ! reduce count for cc test + add %o0, 4, %o0 + lduba [%o0]ASI_USER, %o3 ! load one byte + bz,pt %ncc, .ci_sm_exit + stb %o3, [%o1 + 4] ! store one byte + inc %o0 + lduba [%o0]ASI_USER, %o3 ! load second byte + deccc %o2 + bz,pt %ncc, .ci_sm_exit + stb %o3, [%o1 + 5] ! store second byte + inc %o0 + lduba [%o0]ASI_USER, %o3 ! load third byte + stb %o3, [%o1 + 6] ! store third byte +.ci_sm_exit: + membar #Sync ! sync error barrier + stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g0, %o0 ! return 0 + + .align 16 +.ci_med: + xor %o0, %o1, %o3 ! setup alignment check + btst 1, %o3 + bnz,pt %ncc, .ci_sm_movebytes ! 
unaligned + nop + btst 3, %o3 + bnz,pt %ncc, .ci_med_half ! halfword aligned + nop + btst 7, %o3 + bnz,pt %ncc, .ci_med_word ! word aligned + nop +.ci_med_long: + btst 3, %o0 ! check for + bz,pt %ncc, .ci_med_long1 ! word alignment + nop +.ci_med_long0: + lduba [%o0]ASI_USER, %o3 ! load one byte + inc %o0 + stb %o3,[%o1] ! store byte + inc %o1 + btst 3, %o0 + bnz,pt %ncc, .ci_med_long0 + dec %o2 +.ci_med_long1: ! word aligned + btst 7, %o0 ! check for long word + bz,pt %ncc, .ci_med_long2 + nop + lduwa [%o0]ASI_USER, %o3 ! load word + add %o0, 4, %o0 ! advance SRC by 4 + stw %o3, [%o1] ! store word + add %o1, 4, %o1 ! advance DST by 4 + sub %o2, 4, %o2 ! reduce count by 4 +! +! Now long word aligned and have at least 32 bytes to move +! +.ci_med_long2: + sub %o2, 31, %o2 ! adjust count to allow cc zero test +.ci_med_lmove: + ldxa [%o0]ASI_USER, %o3 ! read long word + subcc %o2, 32, %o2 ! reduce count by 32 + stx %o3, [%o1] ! write long word + add %o0, 8, %o0 ! advance SRC by 8 + ldxa [%o0]ASI_USER, %o3 ! repeat for a total for 4 long words + add %o0, 8, %o0 ! advance SRC by 8 + stx %o3, [%o1 + 8] + add %o1, 32, %o1 ! advance DST by 32 + ldxa [%o0]ASI_USER, %o3 + add %o0, 8, %o0 ! advance SRC by 8 + stx %o3, [%o1 - 16] + ldxa [%o0]ASI_USER, %o3 + add %o0, 8, %o0 ! advance SRC by 8 + bgt,pt %ncc, .ci_med_lmove ! loop til 31 or fewer bytes left + stx %o3, [%o1 - 8] + addcc %o2, 24, %o2 ! restore count to long word offset + ble,pt %ncc, .ci_med_lextra ! check for more long words to move + nop +.ci_med_lword: + ldxa [%o0]ASI_USER, %o3 ! read long word + subcc %o2, 8, %o2 ! reduce count by 8 + stx %o3, [%o1] ! write long word + add %o0, 8, %o0 ! advance SRC by 8 + bgt,pt %ncc, .ci_med_lword ! loop til 7 or fewer bytes left + add %o1, 8, %o1 ! advance DST by 8 +.ci_med_lextra: + addcc %o2, 7, %o2 ! restore rest of count + bz,pt %ncc, .ci_sm_exit ! if zero, then done + deccc %o2 + bz,pt %ncc, .ci_sm_byte + nop + ba,pt %ncc, .ci_sm_half + nop + + .align 16 + nop ! instruction alignment + ! see discussion at start of file +.ci_med_word: + btst 3, %o0 ! check for + bz,pt %ncc, .ci_med_word1 ! word alignment + nop +.ci_med_word0: + lduba [%o0]ASI_USER, %o3 ! load one byte + inc %o0 + stb %o3,[%o1] ! store byte + inc %o1 + btst 3, %o0 + bnz,pt %ncc, .ci_med_word0 + dec %o2 +! +! Now word aligned and have at least 36 bytes to move +! +.ci_med_word1: + sub %o2, 15, %o2 ! adjust count to allow cc zero test +.ci_med_wmove: + lduwa [%o0]ASI_USER, %o3 ! read word + subcc %o2, 16, %o2 ! reduce count by 16 + stw %o3, [%o1] ! write word + add %o0, 4, %o0 ! advance SRC by 4 + lduwa [%o0]ASI_USER, %o3 ! repeat for a total for 4 words + add %o0, 4, %o0 ! advance SRC by 4 + stw %o3, [%o1 + 4] + add %o1, 16, %o1 ! advance DST by 16 + lduwa [%o0]ASI_USER, %o3 + add %o0, 4, %o0 ! advance SRC by 4 + stw %o3, [%o1 - 8] + lduwa [%o0]ASI_USER, %o3 + add %o0, 4, %o0 ! advance SRC by 4 + bgt,pt %ncc, .ci_med_wmove ! loop til 15 or fewer bytes left + stw %o3, [%o1 - 4] + addcc %o2, 12, %o2 ! restore count to word offset + ble,pt %ncc, .ci_med_wextra ! check for more words to move + nop +.ci_med_word2: + lduwa [%o0]ASI_USER, %o3 ! read word + subcc %o2, 4, %o2 ! reduce count by 4 + stw %o3, [%o1] ! write word + add %o0, 4, %o0 ! advance SRC by 4 + bgt,pt %ncc, .ci_med_word2 ! loop til 3 or fewer bytes left + add %o1, 4, %o1 ! advance DST by 4 +.ci_med_wextra: + addcc %o2, 3, %o2 ! restore rest of count + bz,pt %ncc, .ci_sm_exit ! 
if zero, then done + deccc %o2 + bz,pt %ncc, .ci_sm_byte + nop + ba,pt %ncc, .ci_sm_half + nop + + .align 16 + nop ! instruction alignment + ! see discussion at start of file +.ci_med_half: + btst 1, %o0 ! check for + bz,pt %ncc, .ci_med_half1 ! half word alignment + nop + lduba [%o0]ASI_USER, %o3 ! load one byte + inc %o0 + stb %o3,[%o1] ! store byte + inc %o1 + dec %o2 +! +! Now half word aligned and have at least 38 bytes to move +! +.ci_med_half1: + sub %o2, 7, %o2 ! adjust count to allow cc zero test +.ci_med_hmove: + lduha [%o0]ASI_USER, %o3 ! read half word + subcc %o2, 8, %o2 ! reduce count by 8 + sth %o3, [%o1] ! write half word + add %o0, 2, %o0 ! advance SRC by 2 + lduha [%o0]ASI_USER, %o3 ! repeat for a total for 4 halfwords + add %o0, 2, %o0 ! advance SRC by 2 + sth %o3, [%o1 + 2] + add %o1, 8, %o1 ! advance DST by 8 + lduha [%o0]ASI_USER, %o3 + add %o0, 2, %o0 ! advance SRC by 2 + sth %o3, [%o1 - 4] + lduha [%o0]ASI_USER, %o3 + add %o0, 2, %o0 ! advance SRC by 2 + bgt,pt %ncc, .ci_med_hmove ! loop til 7 or fewer bytes left + sth %o3, [%o1 - 2] + addcc %o2, 7, %o2 ! restore count + bz,pt %ncc, .ci_sm_exit + deccc %o2 + bz,pt %ncc, .ci_sm_byte + nop + ba,pt %ncc, .ci_sm_half + nop + +.sm_copyin_err: + membar #Sync + stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + mov SM_SAVE_SRC, %o0 + mov SM_SAVE_DST, %o1 + mov SM_SAVE_COUNT, %o2 + ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler + tst %o3 + bz,pt %ncc, 3f ! if not, return error + nop + ldn [%o3 + CP_COPYIN], %o5 ! if handler, invoke it with + jmp %o5 ! original arguments + nop +3: + retl + or %g0, -1, %o0 ! return errno value + + SET_SIZE(copyin) + + +/* + * The _more entry points are not intended to be used directly by + * any caller from outside this file. They are provided to allow + * profiling and dtrace of the portions of the copy code that uses + * the floating point registers. + * This entry is particularly important as DTRACE (at least as of + * 4/2004) does not support leaf functions. + */ + + ENTRY(copyin_more) +.copyin_more: + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + set .copyin_err, REAL_LOFAULT + +/* + * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes + */ +.do_copyin: + set copyio_fault, %l7 ! .copyio_fault is lofault val + + ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler + membar #Sync ! sync error barrier + stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault + + mov %i0, SAVE_SRC + mov %i1, SAVE_DST + mov %i2, SAVE_COUNT + + FP_NOMIGRATE(6, 7) + + rd %fprs, %o2 ! check for unused fp + st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs + btst FPRS_FEF, %o2 + bz,a,pt %icc, .do_blockcopyin + wr %g0, FPRS_FEF, %fprs + + BST_FPQ2Q4_TOSTACK(%o2) + +.do_blockcopyin: + rd %gsr, %o2 + stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr + or %l6, FPUSED_FLAG, %l6 + + andcc DST, VIS_BLOCKSIZE - 1, TMP + mov ASI_USER, %asi + bz,pt %ncc, 2f + neg TMP + add TMP, VIS_BLOCKSIZE, TMP + + ! TMP = bytes required to align DST on FP_BLOCK boundary + ! Using SRC as a tmp here + cmp TMP, 3 + bleu,pt %ncc, 1f + sub CNT,TMP,CNT ! adjust main count + sub TMP, 3, TMP ! adjust for end of loop test +.ci_blkalign: + lduba [REALSRC]%asi, SRC ! move 4 bytes per loop iteration + stb SRC, [DST] + subcc TMP, 4, TMP + lduba [REALSRC + 1]%asi, SRC + add REALSRC, 4, REALSRC + stb SRC, [DST + 1] + lduba [REALSRC - 2]%asi, SRC + add DST, 4, DST + stb SRC, [DST - 2] + lduba [REALSRC - 1]%asi, SRC + bgu,pt %ncc, .ci_blkalign + stb SRC, [DST - 1] + + addcc TMP, 3, TMP ! 
restore count adjustment + bz,pt %ncc, 2f ! no bytes left? + nop +1: lduba [REALSRC]%asi, SRC + inc REALSRC + inc DST + deccc TMP + bgu %ncc, 1b + stb SRC, [DST - 1] + +2: + andn REALSRC, 0x7, SRC + alignaddr REALSRC, %g0, %g0 + + ! SRC - 8-byte aligned + ! DST - 64-byte aligned + prefetcha [SRC]%asi, #one_read + prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read + prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read + prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read + ldda [SRC]%asi, %f16 +#if CHEETAH_PREFETCH > 4 + prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read +#endif + ldda [SRC + 0x08]%asi, %f18 +#if CHEETAH_PREFETCH > 5 + prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read +#endif + ldda [SRC + 0x10]%asi, %f20 +#if CHEETAH_PREFETCH > 6 + prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read +#endif + faligndata %f16, %f18, %f48 + ldda [SRC + 0x18]%asi, %f22 +#if CHEETAH_PREFETCH > 7 + prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read +#endif + faligndata %f18, %f20, %f50 + ldda [SRC + 0x20]%asi, %f24 + faligndata %f20, %f22, %f52 + ldda [SRC + 0x28]%asi, %f26 + faligndata %f22, %f24, %f54 + ldda [SRC + 0x30]%asi, %f28 + faligndata %f24, %f26, %f56 + ldda [SRC + 0x38]%asi, %f30 + faligndata %f26, %f28, %f58 + ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 + sub CNT, VIS_BLOCKSIZE, CNT + add SRC, VIS_BLOCKSIZE, SRC + add REALSRC, VIS_BLOCKSIZE, REALSRC + ba,a,pt %ncc, 1f + nop + .align 16 +1: + ldda [SRC + 0x08]%asi, %f18 + faligndata %f28, %f30, %f60 + ldda [SRC + 0x10]%asi, %f20 + faligndata %f30, %f16, %f62 + stda %f48, [DST]ASI_BLK_P + ldda [SRC + 0x18]%asi, %f22 + faligndata %f16, %f18, %f48 + ldda [SRC + 0x20]%asi, %f24 + faligndata %f18, %f20, %f50 + ldda [SRC + 0x28]%asi, %f26 + faligndata %f20, %f22, %f52 + ldda [SRC + 0x30]%asi, %f28 + faligndata %f22, %f24, %f54 + ldda [SRC + 0x38]%asi, %f30 + faligndata %f24, %f26, %f56 + sub CNT, VIS_BLOCKSIZE, CNT + ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 + faligndata %f26, %f28, %f58 + prefetcha [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read + add DST, VIS_BLOCKSIZE, DST + prefetcha [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read + add REALSRC, VIS_BLOCKSIZE, REALSRC + cmp CNT, VIS_BLOCKSIZE + 8 + bgu,pt %ncc, 1b + add SRC, VIS_BLOCKSIZE, SRC + + ! only if REALSRC & 0x7 is 0 + cmp CNT, VIS_BLOCKSIZE + bne %ncc, 3f + andcc REALSRC, 0x7, %g0 + bz,pt %ncc, 2f + nop +3: + faligndata %f28, %f30, %f60 + faligndata %f30, %f16, %f62 + stda %f48, [DST]ASI_BLK_P + add DST, VIS_BLOCKSIZE, DST + ba,pt %ncc, 3f + nop +2: + ldda [SRC + 0x08]%asi, %f18 + fsrc1 %f28, %f60 + ldda [SRC + 0x10]%asi, %f20 + fsrc1 %f30, %f62 + stda %f48, [DST]ASI_BLK_P + ldda [SRC + 0x18]%asi, %f22 + fsrc1 %f16, %f48 + ldda [SRC + 0x20]%asi, %f24 + fsrc1 %f18, %f50 + ldda [SRC + 0x28]%asi, %f26 + fsrc1 %f20, %f52 + ldda [SRC + 0x30]%asi, %f28 + fsrc1 %f22, %f54 + ldda [SRC + 0x38]%asi, %f30 + fsrc1 %f24, %f56 + sub CNT, VIS_BLOCKSIZE, CNT + add DST, VIS_BLOCKSIZE, DST + add SRC, VIS_BLOCKSIZE, SRC + add REALSRC, VIS_BLOCKSIZE, REALSRC + fsrc1 %f26, %f58 + fsrc1 %f28, %f60 + fsrc1 %f30, %f62 + stda %f48, [DST]ASI_BLK_P + add DST, VIS_BLOCKSIZE, DST + ba,a,pt %ncc, 4f + nop + +3: tst CNT + bz,a %ncc, 4f + nop + +5: lduba [REALSRC]ASI_USER, TMP + inc REALSRC + inc DST + deccc CNT + bgu %ncc, 5b + stb TMP, [DST - 1] +4: + +.copyin_exit: + membar #Sync + + FPRAS_INTERVAL(FPRAS_COPYIN, 1, %l5, %o2, %o3, %o4, %o5, 8) + FPRAS_REWRITE_TYPE1(1, %l5, %f48, %o2, 9) + FPRAS_CHECK(FPRAS_COPYIN, %l5, 9) ! 
lose outputs + + ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr + wr %o2, 0, %gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz,pt %icc, 4f + nop + + BLD_FPQ2Q4_FROMSTACK(%o2) + + ba,pt %ncc, 1f + wr %o3, 0, %fprs ! restore fprs + +4: + FZEROQ2Q4 + wr %o3, 0, %fprs ! restore fprs + +1: + membar #Sync ! sync error barrier + andn %l6, FPUSED_FLAG, %l6 + stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + FP_ALLOWMIGRATE(5, 6) + ret + restore %g0, 0, %o0 +/* + * We got here because of a fault during copyin + * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). + */ +.copyin_err: + ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler + tst %o4 + bz,pt %ncc, 2f ! if not, return error + nop + ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with + jmp %g2 ! original arguments + restore %g0, 0, %g0 ! dispose of copy window +2: + ret + restore %g0, -1, %o0 ! return error value + + + SET_SIZE(copyin_more) + +#endif /* lint */ + +#ifdef lint + +/*ARGSUSED*/ +int +xcopyin(const void *uaddr, void *kaddr, size_t count) +{ return (0); } + +#else /* lint */ + + ENTRY(xcopyin) + + cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case + bleu,pt %ncc, .xcopyin_small ! go to larger cases + xor %o0, %o1, %o3 ! are src, dst alignable? + btst 7, %o3 ! + bz,pt %ncc, .xcopyin_8 ! check for longword alignment + nop + btst 1, %o3 ! + bz,pt %ncc, .xcopyin_2 ! check for half-word + nop + sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_1)], %o3 + tst %o3 + bz,pn %icc, .xcopyin_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .xcopyin_small ! go to small copy + nop + ba,pt %ncc, .xcopyin_more ! otherwise go to large copy + nop +.xcopyin_2: + btst 3, %o3 ! + bz,pt %ncc, .xcopyin_4 ! check for word alignment + nop + sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_2)], %o3 + tst %o3 + bz,pn %icc, .xcopyin_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .xcopyin_small ! go to small copy + nop + ba,pt %ncc, .xcopyin_more ! otherwise go to large copy + nop +.xcopyin_4: + ! already checked longword, must be word aligned + sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_4)], %o3 + tst %o3 + bz,pn %icc, .xcopyin_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .xcopyin_small ! go to small copy + nop + ba,pt %ncc, .xcopyin_more ! otherwise go to large copy + nop +.xcopyin_8: + sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_8)], %o3 + tst %o3 + bz,pn %icc, .xcopyin_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .xcopyin_small ! go to small copy + nop + ba,pt %ncc, .xcopyin_more ! otherwise go to large copy + nop + +.xcopyin_small: + sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value + or %o5, %lo(.sm_xcopyin_err), %o5 + ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofaul + membar #Sync ! sync error barrier + ba,pt %ncc, .sm_do_copyin ! common code + stn %o5, [THREAD_REG + T_LOFAULT] + +.xcopyin_more: + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value + ba,pt %ncc, .do_copyin + or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT + +/* + * We got here because of fault during xcopyin + * Errno value is in ERRNO + */ +.xcopyin_err: + ldn [THREAD_REG + T_COPYOPS], %o4 ! 
check for copyop handler + tst %o4 + bz,pt %ncc, 2f ! if not, return error + nop + ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with + jmp %g2 ! original arguments + restore %g0, 0, %g0 ! dispose of copy window +2: + ret + restore ERRNO, 0, %o0 ! return errno value + +.sm_xcopyin_err: + + membar #Sync + stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + mov SM_SAVE_SRC, %o0 + mov SM_SAVE_DST, %o1 + mov SM_SAVE_COUNT, %o2 + ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler + tst %o3 + bz,pt %ncc, 3f ! if not, return error + nop + ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with + jmp %o5 ! original arguments + nop +3: + retl + or %g1, 0, %o0 ! return errno value + + SET_SIZE(xcopyin) + +#endif /* lint */ + +#ifdef lint + +/*ARGSUSED*/ +int +xcopyin_little(const void *uaddr, void *kaddr, size_t count) +{ return (0); } + +#else /* lint */ + + ENTRY(xcopyin_little) + sethi %hi(.xcopyio_err), %o5 + or %o5, %lo(.xcopyio_err), %o5 + ldn [THREAD_REG + T_LOFAULT], %o4 + membar #Sync ! sync error barrier + stn %o5, [THREAD_REG + T_LOFAULT] + mov %o4, %o5 + + subcc %g0, %o2, %o3 + add %o0, %o2, %o0 + bz,pn %ncc, 2f ! check for zero bytes + sub %o2, 1, %o4 + add %o0, %o4, %o0 ! start w/last byte + add %o1, %o2, %o1 + lduba [%o0 + %o3]ASI_AIUSL, %o4 + +1: stb %o4, [%o1 + %o3] + inccc %o3 + sub %o0, 2, %o0 ! get next byte + bcc,a,pt %ncc, 1b + lduba [%o0 + %o3]ASI_AIUSL, %o4 + +2: + membar #Sync ! sync error barrier + stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g0, %o0 ! return (0) + +.xcopyio_err: + membar #Sync ! sync error barrier + stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g1, %o0 + + SET_SIZE(xcopyin_little) + +#endif /* lint */ + + +/* + * Copy a block of storage - must not overlap (from + len <= to). + * No fault handler installed (to be called under on_fault()) + */ +#if defined(lint) + +/* ARGSUSED */ +void +copyin_noerr(const void *ufrom, void *kto, size_t count) +{} + +#else /* lint */ + ENTRY(copyin_noerr) + + cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case + bleu,pt %ncc, .copyin_ne_small ! go to larger cases + xor %o0, %o1, %o3 ! are src, dst alignable? + btst 7, %o3 ! + bz,pt %ncc, .copyin_ne_8 ! check for longword alignment + nop + btst 1, %o3 ! + bz,pt %ncc, .copyin_ne_2 ! check for half-word + nop + sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_1)], %o3 + tst %o3 + bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyin_ne_small ! go to small copy + nop + ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy + nop +.copyin_ne_2: + btst 3, %o3 ! + bz,pt %ncc, .copyin_ne_4 ! check for word alignment + nop + sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_2)], %o3 + tst %o3 + bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyin_ne_small ! go to small copy + nop + ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy + nop +.copyin_ne_4: + ! already checked longword, must be word aligned + sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_4)], %o3 + tst %o3 + bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyin_ne_small ! go to small copy + nop + ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy + nop +.copyin_ne_8: + sethi %hi(hw_copy_limit_8), %o3 ! 
Check copy limit + ld [%o3 + %lo(hw_copy_limit_8)], %o3 + tst %o3 + bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyin_ne_small ! go to small copy + nop + ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy + nop + +.copyin_ne_small: + ldn [THREAD_REG + T_LOFAULT], %o4 + tst %o4 + bz,pn %ncc, .sm_do_copyin + nop + sethi %hi(.sm_copyio_noerr), %o5 + or %o5, %lo(.sm_copyio_noerr), %o5 + membar #Sync ! sync error barrier + ba,pt %ncc, .sm_do_copyin + stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault + +.copyin_noerr_more: + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + sethi %hi(.copyio_noerr), REAL_LOFAULT + ba,pt %ncc, .do_copyin + or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT + +.copyio_noerr: + jmp %l6 + restore %g0,0,%g0 + +.sm_copyio_noerr: + membar #Sync + stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault + jmp %o4 + nop + + SET_SIZE(copyin_noerr) +#endif /* lint */ + +/* + * Copy a block of storage - must not overlap (from + len <= to). + * No fault handler installed (to be called under on_fault()) + */ + +#if defined(lint) + +/* ARGSUSED */ +void +copyout_noerr(const void *kfrom, void *uto, size_t count) +{} + +#else /* lint */ + ENTRY(copyout_noerr) + + cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case + bleu,pt %ncc, .copyout_ne_small ! go to larger cases + xor %o0, %o1, %o3 ! are src, dst alignable? + btst 7, %o3 ! + bz,pt %ncc, .copyout_ne_8 ! check for longword alignment + nop + btst 1, %o3 ! + bz,pt %ncc, .copyout_ne_2 ! check for half-word + nop + sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_1)], %o3 + tst %o3 + bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyout_ne_small ! go to small copy + nop + ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy + nop +.copyout_ne_2: + btst 3, %o3 ! + bz,pt %ncc, .copyout_ne_4 ! check for word alignment + nop + sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_2)], %o3 + tst %o3 + bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyout_ne_small ! go to small copy + nop + ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy + nop +.copyout_ne_4: + ! already checked longword, must be word aligned + sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_4)], %o3 + tst %o3 + bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyout_ne_small ! go to small copy + nop + ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy + nop +.copyout_ne_8: + sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit + ld [%o3 + %lo(hw_copy_limit_8)], %o3 + tst %o3 + bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy + cmp %o2, %o3 ! if length <= limit + bleu,pt %ncc, .copyout_ne_small ! go to small copy + nop + ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy + nop + +.copyout_ne_small: + ldn [THREAD_REG + T_LOFAULT], %o4 + tst %o4 + bz,pn %ncc, .sm_do_copyout + nop + sethi %hi(.sm_copyio_noerr), %o5 + or %o5, %lo(.sm_copyio_noerr), %o5 + membar #Sync ! sync error barrier + ba,pt %ncc, .sm_do_copyout + stn %o5, [THREAD_REG + T_LOFAULT] ! 
set/save t_lofault + +.copyout_noerr_more: + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + sethi %hi(.copyio_noerr), REAL_LOFAULT + ba,pt %ncc, .do_copyout + or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT + + SET_SIZE(copyout_noerr) +#endif /* lint */ + + +/* + * hwblkclr - clears block-aligned, block-multiple-sized regions that are + * longer than 256 bytes in length using spitfire's block stores. If + * the criteria for using this routine are not met then it calls bzero + * and returns 1. Otherwise 0 is returned indicating success. + * Caller is responsible for ensuring use_hw_bzero is true and that + * kpreempt_disable() has been called. + */ +#ifdef lint +/*ARGSUSED*/ +int +hwblkclr(void *addr, size_t len) +{ + return(0); +} +#else /* lint */ + ! %i0 - start address + ! %i1 - length of region (multiple of 64) + ! %l0 - saved fprs + ! %l1 - pointer to saved %d0 block + ! %l2 - saved curthread->t_lwp + + ENTRY(hwblkclr) + ! get another window w/space for one aligned block of saved fpregs + save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp + + ! Must be block-aligned + andcc %i0, (VIS_BLOCKSIZE-1), %g0 + bnz,pn %ncc, 1f + nop + + ! ... and must be 256 bytes or more + cmp %i1, 256 + blu,pn %ncc, 1f + nop + + ! ... and length must be a multiple of VIS_BLOCKSIZE + andcc %i1, (VIS_BLOCKSIZE-1), %g0 + bz,pn %ncc, 2f + nop + +1: ! punt, call bzero but notify the caller that bzero was used + mov %i0, %o0 + call bzero + mov %i1, %o1 + ret + restore %g0, 1, %o0 ! return (1) - did not use block operations + +2: rd %fprs, %l0 ! check for unused fp + btst FPRS_FEF, %l0 + bz,pt %icc, 1f + nop + + ! save in-use fpregs on stack + membar #Sync + add %fp, STACK_BIAS - 65, %l1 + and %l1, -VIS_BLOCKSIZE, %l1 + stda %d0, [%l1]ASI_BLK_P + +1: membar #StoreStore|#StoreLoad|#LoadStore + wr %g0, FPRS_FEF, %fprs + wr %g0, ASI_BLK_P, %asi + + ! Clear block + fzero %d0 + fzero %d2 + fzero %d4 + fzero %d6 + fzero %d8 + fzero %d10 + fzero %d12 + fzero %d14 + + mov 256, %i3 + ba,pt %ncc, .pz_doblock + nop + +.pz_blkstart: + ! stda %d0, [%i0 + 192]%asi ! in dly slot of branch that got us here + stda %d0, [%i0 + 128]%asi + stda %d0, [%i0 + 64]%asi + stda %d0, [%i0]%asi +.pz_zinst: + add %i0, %i3, %i0 + sub %i1, %i3, %i1 +.pz_doblock: + cmp %i1, 256 + bgeu,a %ncc, .pz_blkstart + stda %d0, [%i0 + 192]%asi + + cmp %i1, 64 + blu %ncc, .pz_finish + + andn %i1, (64-1), %i3 + srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words + set .pz_zinst, %i4 + sub %i4, %i2, %i4 + jmp %i4 + nop + +.pz_finish: + membar #Sync + btst FPRS_FEF, %l0 + bz,a .pz_finished + wr %l0, 0, %fprs ! restore fprs + + ! restore fpregs from stack + ldda [%l1]ASI_BLK_P, %d0 + membar #Sync + wr %l0, 0, %fprs ! restore fprs + +.pz_finished: + ret + restore %g0, 0, %o0 ! return (bzero or not) + + SET_SIZE(hwblkclr) +#endif /* lint */ + +#ifdef lint +/*ARGSUSED*/ +void +hw_pa_bcopy32(uint64_t src, uint64_t dst) +{} +#else /*!lint */ + /* + * Copy 32 bytes of data from src (%o0) to dst (%o1) + * using physical addresses. 
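+ *
+ * Roughly equivalent C sketch (illustration only; src_pa/dst_pa stand in
+ * for the physical addresses, and the real code below also disables
+ * interrupts and invalidates the destination D$ line before storing):
+ *
+ *	for (i = 0; i < 4; i++)
+ *		((uint64_t *)dst_pa)[i] = ((uint64_t *)src_pa)[i];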
+ */ + ENTRY_NP(hw_pa_bcopy32) + rdpr %pstate, %g1 + andn %g1, PSTATE_IE, %g2 + wrpr %g0, %g2, %pstate + + rdpr %pstate, %g0 + ldxa [%o0]ASI_MEM, %o2 + add %o0, 8, %o0 + ldxa [%o0]ASI_MEM, %o3 + add %o0, 8, %o0 + ldxa [%o0]ASI_MEM, %o4 + add %o0, 8, %o0 + ldxa [%o0]ASI_MEM, %o5 + + stxa %g0, [%o1]ASI_DC_INVAL + membar #Sync + + stxa %o2, [%o1]ASI_MEM + add %o1, 8, %o1 + stxa %o3, [%o1]ASI_MEM + add %o1, 8, %o1 + stxa %o4, [%o1]ASI_MEM + add %o1, 8, %o1 + stxa %o5, [%o1]ASI_MEM + + retl + wrpr %g0, %g1, %pstate + + SET_SIZE(hw_pa_bcopy32) + +#endif /* lint */ + +#if defined(lint) + +int use_hw_bcopy = 1; +int use_hw_bzero = 1; +uint_t hw_copy_limit_1 = 0; +uint_t hw_copy_limit_2 = 0; +uint_t hw_copy_limit_4 = 0; +uint_t hw_copy_limit_8 = 0; + +#else /* !lint */ + + DGDEF(use_hw_bcopy) + .word 1 + DGDEF(use_hw_bzero) + .word 1 + DGDEF(hw_copy_limit_1) + .word 0 + DGDEF(hw_copy_limit_2) + .word 0 + DGDEF(hw_copy_limit_4) + .word 0 + DGDEF(hw_copy_limit_8) + .word 0 + + .align 64 + .section ".text" +#endif /* !lint */ diff --git a/usr/src/uts/sun4u/cpu/common_asm.s b/usr/src/uts/sun4u/cpu/common_asm.s new file mode 100644 index 0000000000..e434e4cca6 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/common_asm.s @@ -0,0 +1,1333 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#if !defined(lint) +#include "assym.h" +#endif /* !lint */ + +/* + * General assembly language routines. + * It is the intent of this file to contain routines that are + * specific to cpu architecture. + */ + +/* + * WARNING: If you add a fast trap handler which can be invoked by a + * non-privileged user, you may have to use the FAST_TRAP_DONE macro + * instead of "done" instruction to return back to the user mode. See + * comments for the "fast_trap_done" entry point for more information. + */ +#define FAST_TRAP_DONE \ + ba,a fast_trap_done + +/* + * Override GET_NATIVE_TIME for the cpu module code. This is not + * guaranteed to be exactly one instruction, be careful of using + * the macro in delay slots. + * + * Do not use any instruction that modifies condition codes as the + * caller may depend on these to remain unchanged across the macro. 
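+ *
+ * For example (an illustrative fragment, not taken from this file), a
+ * caller may legitimately expand the macro between setting and testing
+ * the condition codes, so the expansion must leave %icc/%xcc untouched:
+ *
+ *	cmp	%o2, %o3
+ *	GET_NATIVE_TIME(%o0, %o4, %o5)	! may be several instructions
+ *	bge,pt	%xcc, 1f
+ *	nop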
+ */ +#if defined(CHEETAH) + +#define GET_NATIVE_TIME(out, scr1, scr2) \ + rd STICK, out +#define DELTA_NATIVE_TIME(delta, reg, scr1, scr2, scr3) \ + rd STICK, reg; \ + add reg, delta, reg; \ + wr reg, STICK +#define RD_TICKCMPR(out, scr) \ + rd STICK_COMPARE, out +#define WR_TICKCMPR(in, scr1, scr2, label) \ + wr in, STICK_COMPARE + +#elif defined(HUMMINGBIRD) +#include <sys/spitregs.h> + +/* + * the current hummingbird version of %stick and %stick_cmp + * were both implemented as (2) 32-bit locations in ASI_IO space; + * the hdwr should support atomic r/w; meanwhile: ugly alert! ... + * + * 64-bit opcodes are required, but move only 32-bits: + * + * ldxa [phys]ASI_IO, %dst reads the low 32-bits from phys into %dst + * stxa %src, [phys]ASI_IO writes the low 32-bits from %src into phys + * + * reg equivalent [phys]ASI_IO + * ------------------ --------------- + * %stick_cmp low-32 0x1FE.0000.F060 + * %stick_cmp high-32 0x1FE.0000.F068 + * %stick low-32 0x1FE.0000.F070 + * %stick high-32 0x1FE.0000.F078 + */ +#define HSTC_LOW 0x60 /* stick_cmp low 32-bits */ +#define HSTC_HIGH 0x68 /* stick_cmp high 32-bits */ +#define HST_LOW 0x70 /* stick low 32-bits */ +#define HST_HIGH 0x78 /* stick high 32-bits */ +#define HST_DIFF 0x08 /* low<-->high diff */ + +/* + * Any change in the number of instructions in SETL41() + * will affect SETL41_OFF + */ +#define SETL41(reg, byte) \ + sethi %hi(0x1FE00000), reg; /* 0000.0000.1FE0.0000 */ \ + or reg, 0xF, reg; /* 0000.0000.1FE0.000F */ \ + sllx reg, 12, reg; /* 0000.01FE.0000.F000 */ \ + or reg, byte, reg; /* 0000.01FE.0000.F0xx */ + +/* + * SETL41_OFF is used to calulate the relative PC value when a + * branch instruction needs to go over SETL41() macro + */ +#define SETL41_OFF 16 + +/* + * reading stick requires 2 loads, and there could be an intervening + * low-to-high 32-bit rollover resulting in a return value that is + * off by about (2 ^ 32); this rare case is prevented by re-reading + * the low-32 bits after the high-32 and verifying the "after" value + * is >= the "before" value; if not, increment the high-32 value. + * + * this method is limited to 1 rollover, and based on the fixed + * stick-frequency (5555555), requires the loads to complete within + * 773 seconds; incrementing the high-32 value will not overflow for + * about 52644 years. + * + * writing stick requires 2 stores; if the old/new low-32 value is + * near 0xffffffff, there could be another rollover (also rare). + * to prevent this, we first write a 0 to the low-32, then write + * new values to the high-32 then the low-32. + * + * When we detect a carry in the lower %stick register, we need to + * read HST_HIGH again. However at the point where we detect this, + * we need to rebuild the register address HST_HIGH.This involves more + * than one instructions and a branch is unavoidable. However, most of + * the time, there is no carry. So we take the penalty of a branch + * instruction only when there is carry (less frequent). + * + * For GET_NATIVE_TIME(), we start afresh and branch to SETL41(). + * For DELTA_NATIVE_TIME(), we branch to just after SETL41() since + * addr already points to HST_LOW. + * + * NOTE: this method requires disabling interrupts before using + * DELTA_NATIVE_TIME. 
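+ *
+ * In rough C pseudo-code, GET_NATIVE_TIME() below amounts to (sketch
+ * only; read32() stands in for the 32-bit ldxa from ASI_IO space):
+ *
+ *	do {
+ *		before = read32(HST_LOW);
+ *		high   = read32(HST_HIGH);
+ *		after  = read32(HST_LOW);
+ *	} while (after < before);	! low 32 bits rolled over; retry
+ *	stick = ((uint64_t)high << 32) | after;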
+ */ +#define GET_NATIVE_TIME(out, scr, tmp) \ + SETL41(scr, HST_LOW); \ + ldxa [scr]ASI_IO, tmp; \ + inc HST_DIFF, scr; \ + ldxa [scr]ASI_IO, out; \ + dec HST_DIFF, scr; \ + ldxa [scr]ASI_IO, scr; \ + sub scr, tmp, tmp; \ + brlz,pn tmp, .-(SETL41_OFF+24); \ + sllx out, 32, out; \ + or out, scr, out +#define DELTA_NATIVE_TIME(delta, addr, high, low, tmp) \ + SETL41(addr, HST_LOW); \ + ldxa [addr]ASI_IO, tmp; \ + inc HST_DIFF, addr; \ + ldxa [addr]ASI_IO, high; \ + dec HST_DIFF, addr; \ + ldxa [addr]ASI_IO, low; \ + sub low, tmp, tmp; \ + brlz,pn tmp, .-24; \ + sllx high, 32, high; \ + or high, low, high; \ + add high, delta, high; \ + srl high, 0, low; \ + srlx high, 32, high; \ + stxa %g0, [addr]ASI_IO; \ + inc HST_DIFF, addr; \ + stxa high, [addr]ASI_IO; \ + dec HST_DIFF, addr; \ + stxa low, [addr]ASI_IO +#define RD_TICKCMPR(out, scr) \ + SETL41(scr, HSTC_LOW); \ + ldxa [scr]ASI_IO, out; \ + inc HST_DIFF, scr; \ + ldxa [scr]ASI_IO, scr; \ + sllx scr, 32, scr; \ + or scr, out, out +#define WR_TICKCMPR(in, scra, scrd, label) \ + SETL41(scra, HSTC_HIGH); \ + srlx in, 32, scrd; \ + stxa scrd, [scra]ASI_IO; \ + dec HST_DIFF, scra; \ + stxa in, [scra]ASI_IO + +#else /* !CHEETAH && !HUMMINGBIRD */ + +#define GET_NATIVE_TIME(out, scr1, scr2) \ + rdpr %tick, out +#define DELTA_NATIVE_TIME(delta, reg, scr1, scr2, scr3) \ + rdpr %tick, reg; \ + add reg, delta, reg; \ + wrpr reg, %tick +#define RD_TICKCMPR(out, scr) \ + rd TICK_COMPARE, out +#ifdef BB_ERRATA_1 /* writes to TICK_COMPARE may fail */ +/* + * Writes to the TICK_COMPARE register sometimes fail on blackbird modules. + * The failure occurs only when the following instruction decodes to wr or + * wrpr. The workaround is to immediately follow writes to TICK_COMPARE + * with a read, thus stalling the pipe and keeping following instructions + * from causing data corruption. Aligning to a quadword will ensure these + * two instructions are not split due to i$ misses. + */ +#define WR_TICKCMPR(cmpr,scr1,scr2,label) \ + ba,a .bb_errata_1.label ;\ + .align 64 ;\ +.bb_errata_1.label: ;\ + wr cmpr, TICK_COMPARE ;\ + rd TICK_COMPARE, %g0 +#else /* BB_ERRATA_1 */ +#define WR_TICKCMPR(in,scr1,scr2,label) \ + wr in, TICK_COMPARE +#endif /* BB_ERRATA_1 */ + +#endif /* !CHEETAH && !HUMMINGBIRD */ + +#include <sys/clock.h> + +#if defined(lint) +#include <sys/types.h> +#include <sys/scb.h> +#include <sys/systm.h> +#include <sys/regset.h> +#include <sys/sunddi.h> +#include <sys/lockstat.h> +#endif /* lint */ + + +#include <sys/asm_linkage.h> +#include <sys/privregs.h> +#include <sys/machparam.h> /* To get SYSBASE and PAGESIZE */ +#include <sys/machthread.h> +#include <sys/clock.h> +#include <sys/intreg.h> +#include <sys/psr_compat.h> +#include <sys/isa_defs.h> +#include <sys/dditypes.h> +#include <sys/intr.h> + +#if !defined(lint) +#include "assym.h" +#endif /* !lint */ + +#if defined(lint) + +uint_t +get_impl(void) +{ return (0); } + +#else /* lint */ + + ENTRY(get_impl) + GET_CPU_IMPL(%o0) + retl + nop + SET_SIZE(get_impl) + +#endif /* lint */ + +#if defined(lint) +/* + * Softint generated when counter field of tick reg matches value field + * of tick_cmpr reg + */ +/*ARGSUSED*/ +void +tickcmpr_set(uint64_t clock_cycles) +{} + +#else /* lint */ + + ENTRY_NP(tickcmpr_set) + ! get 64-bit clock_cycles interval + mov %o0, %o2 + mov 8, %o3 ! A reasonable initial step size +1: + WR_TICKCMPR(%o2,%o4,%o5,__LINE__) ! Write to TICK_CMPR + + GET_NATIVE_TIME(%o0, %o4, %o5) ! Read %tick to confirm the + sllx %o0, 1, %o0 ! value we wrote was in the future. 
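+	! (the sllx above / srlx below pair shakes off bit 63, the NPT bit,
+	! of the value just read, so only the counter field is compared)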
+ srlx %o0, 1, %o0 + + cmp %o2, %o0 ! If the value we wrote was in the + bg,pt %xcc, 2f ! future, then blow out of here. + sllx %o3, 1, %o3 ! If not, then double our step size, + ba,pt %xcc, 1b ! and take another lap. + add %o0, %o3, %o2 ! +2: + retl + nop + SET_SIZE(tickcmpr_set) + +#endif /* lint */ + +#if defined(lint) + +void +tickcmpr_disable(void) +{} + +#else /* lint */ + + ENTRY_NP(tickcmpr_disable) + mov 1, %g1 + sllx %g1, TICKINT_DIS_SHFT, %o0 + WR_TICKCMPR(%o0,%o4,%o5,__LINE__) ! Write to TICK_CMPR + retl + nop + SET_SIZE(tickcmpr_disable) + +#endif /* lint */ + +#if defined(lint) + +/* + * tick_write_delta() increments %tick by the specified delta. This should + * only be called after a CPR event to assure that gethrtime() continues to + * increase monotonically. Obviously, writing %tick needs to de done very + * carefully to avoid introducing unnecessary %tick skew across CPUs. For + * this reason, we make sure we're i-cache hot before actually writing to + * %tick. + */ +/*ARGSUSED*/ +void +tick_write_delta(uint64_t delta) +{} + +#else /* lint */ + +#ifdef DEBUG + .seg ".text" +tick_write_panic: + .asciz "tick_write_delta: interrupts already disabled on entry" +#endif /* DEBUG */ + + ENTRY_NP(tick_write_delta) + rdpr %pstate, %g1 +#ifdef DEBUG + andcc %g1, PSTATE_IE, %g0 ! If DEBUG, check that interrupts + bnz 0f ! aren't already disabled. + sethi %hi(tick_write_panic), %o1 + save %sp, -SA(MINFRAME), %sp ! get a new window to preserve caller + call panic + or %i1, %lo(tick_write_panic), %o0 +#endif /* DEBUG */ +0: wrpr %g1, PSTATE_IE, %pstate ! Disable interrupts + mov %o0, %o2 + ba 0f ! Branch to cache line-aligned instr. + nop + .align 16 +0: nop ! The next 3 instructions are now hot. + DELTA_NATIVE_TIME(%o2, %o3, %o4, %o5, %g2) ! read/inc/write %tick + + retl ! Return + wrpr %g0, %g1, %pstate ! delay: Re-enable interrupts +#endif /* lint */ + +#if defined(lint) +/* + * return 1 if disabled + */ + +int +tickcmpr_disabled(void) +{ return (0); } + +#else /* lint */ + + ENTRY_NP(tickcmpr_disabled) + RD_TICKCMPR(%g1, %o0) + retl + srlx %g1, TICKINT_DIS_SHFT, %o0 + SET_SIZE(tickcmpr_disabled) + +#endif /* lint */ + +/* + * Get current tick + */ +#if defined(lint) + +u_longlong_t +gettick(void) +{ return (0); } + +#else /* lint */ + + ENTRY(gettick) + GET_NATIVE_TIME(%o0, %o2, %o3) + retl + nop + SET_SIZE(gettick) + +#endif /* lint */ + + +/* + * Return the counter portion of the tick register. + */ + +#if defined(lint) + +uint64_t +gettick_counter(void) +{ return(0); } + +#else /* lint */ + + ENTRY_NP(gettick_counter) + rdpr %tick, %o0 + sllx %o0, 1, %o0 + retl + srlx %o0, 1, %o0 ! shake off npt bit + SET_SIZE(gettick_counter) +#endif /* lint */ + +/* + * Provide a C callable interface to the trap that reads the hi-res timer. + * Returns 64-bit nanosecond timestamp in %o0 and %o1. + */ + +#if defined(lint) + +hrtime_t +gethrtime(void) +{ + return ((hrtime_t)0); +} + +hrtime_t +gethrtime_unscaled(void) +{ + return ((hrtime_t)0); +} + +hrtime_t +gethrtime_max(void) +{ + return ((hrtime_t)0); +} + +void +scalehrtime(hrtime_t *hrt) +{ + *hrt = 0; +} + +void +gethrestime(timespec_t *tp) +{ + tp->tv_sec = 0; + tp->tv_nsec = 0; +} + +time_t +gethrestime_sec(void) +{ + return (0); +} + +void +gethrestime_lasttick(timespec_t *tp) +{ + tp->tv_sec = 0; + tp->tv_nsec = 0; +} + +/*ARGSUSED*/ +void +hres_tick(void) +{ +} + +void +panic_hres_tick(void) +{ +} + +#else /* lint */ + + ENTRY_NP(gethrtime) + GET_HRTIME(%g1, %o0, %o1, %o2, %o3, %o4, %o5, %g2) + ! 
%g1 = hrtime + retl + mov %g1, %o0 + SET_SIZE(gethrtime) + + ENTRY_NP(gethrtime_unscaled) + GET_NATIVE_TIME(%g1, %o2, %o3) ! %g1 = native time + retl + mov %g1, %o0 + SET_SIZE(gethrtime_unscaled) + + ENTRY_NP(gethrtime_waitfree) + ALTENTRY(dtrace_gethrtime) + GET_NATIVE_TIME(%g1, %o2, %o3) ! %g1 = native time + NATIVE_TIME_TO_NSEC(%g1, %o2, %o3) + retl + mov %g1, %o0 + SET_SIZE(dtrace_gethrtime) + SET_SIZE(gethrtime_waitfree) + + ENTRY(gethrtime_max) + NATIVE_TIME_MAX(%g1) + NATIVE_TIME_TO_NSEC(%g1, %o0, %o1) + + ! hrtime_t's are signed, max hrtime_t must be positive + mov -1, %o2 + brlz,a %g1, 1f + srlx %o2, 1, %g1 +1: + retl + mov %g1, %o0 + SET_SIZE(gethrtime_max) + + ENTRY(scalehrtime) + ldx [%o0], %o1 + NATIVE_TIME_TO_NSEC(%o1, %o2, %o3) + retl + stx %o1, [%o0] + SET_SIZE(scalehrtime) + +/* + * Fast trap to return a timestamp, uses trap window, leaves traps + * disabled. Returns a 64-bit nanosecond timestamp in %o0 and %o1. + * + * This is the handler for the ST_GETHRTIME trap. + */ + + ENTRY_NP(get_timestamp) + GET_HRTIME(%g1, %g2, %g3, %g4, %g5, %o0, %o1, %o2) ! %g1 = hrtime + srlx %g1, 32, %o0 ! %o0 = hi32(%g1) + srl %g1, 0, %o1 ! %o1 = lo32(%g1) + FAST_TRAP_DONE + SET_SIZE(get_timestamp) + +/* + * Macro to convert GET_HRESTIME() bits into a timestamp. + * + * We use two separate macros so that the platform-dependent GET_HRESTIME() + * can be as small as possible; CONV_HRESTIME() implements the generic part. + */ +#define CONV_HRESTIME(hrestsec, hrestnsec, adj, nslt, nano) \ + brz,pt adj, 3f; /* no adjustments, it's easy */ \ + add hrestnsec, nslt, hrestnsec; /* hrest.tv_nsec += nslt */ \ + brlz,pn adj, 2f; /* if hrestime_adj negative */ \ + srl nslt, ADJ_SHIFT, nslt; /* delay: nslt >>= 4 */ \ + subcc adj, nslt, %g0; /* hrestime_adj - nslt/16 */ \ + movg %xcc, nslt, adj; /* adj by min(adj, nslt/16) */ \ + ba 3f; /* go convert to sec/nsec */ \ + add hrestnsec, adj, hrestnsec; /* delay: apply adjustment */ \ +2: addcc adj, nslt, %g0; /* hrestime_adj + nslt/16 */ \ + bge,a,pt %xcc, 3f; /* is adj less negative? */ \ + add hrestnsec, adj, hrestnsec; /* yes: hrest.nsec += adj */ \ + sub hrestnsec, nslt, hrestnsec; /* no: hrest.nsec -= nslt/16 */ \ +3: cmp hrestnsec, nano; /* more than a billion? */ \ + bl,pt %xcc, 4f; /* if not, we're done */ \ + nop; /* delay: do nothing :( */ \ + add hrestsec, 1, hrestsec; /* hrest.tv_sec++; */ \ + sub hrestnsec, nano, hrestnsec; /* hrest.tv_nsec -= NANOSEC; */ \ +4: + + ENTRY_NP(gethrestime) + GET_HRESTIME(%o1, %o2, %o3, %o4, %o5, %g1, %g2, %g3, %g4) + CONV_HRESTIME(%o1, %o2, %o3, %o4, %o5) + stn %o1, [%o0] + retl + stn %o2, [%o0 + CLONGSIZE] + SET_SIZE(gethrestime) + +/* + * Similar to gethrestime(), but gethrestime_sec() returns current hrestime + * seconds. + */ + ENTRY_NP(gethrestime_sec) + GET_HRESTIME(%o0, %o2, %o3, %o4, %o5, %g1, %g2, %g3, %g4) + CONV_HRESTIME(%o0, %o2, %o3, %o4, %o5) + retl ! %o0 current hrestime seconds + nop + SET_SIZE(gethrestime_sec) + +/* + * Returns the hrestime on the last tick. This is simpler than gethrestime() + * and gethrestime_sec(): no conversion is required. gethrestime_lasttick() + * follows the same locking algorithm as GET_HRESTIME and GET_HRTIME, + * outlined in detail in clock.h. (Unlike GET_HRESTIME/GET_HRTIME, we don't + * rely on load dependencies to effect the membar #LoadLoad, instead declaring + * it explicitly.) + */ + ENTRY_NP(gethrestime_lasttick) + sethi %hi(hres_lock), %o1 +0: + lduw [%o1 + %lo(hres_lock)], %o2 ! Load lock value + membar #LoadLoad ! Load of lock must complete + andn %o2, 1, %o2 ! 
Mask off lowest bit + ldn [%o1 + %lo(hrestime)], %g1 ! Seconds. + add %o1, %lo(hrestime), %o4 + ldn [%o4 + CLONGSIZE], %g2 ! Nanoseconds. + membar #LoadLoad ! All loads must complete + lduw [%o1 + %lo(hres_lock)], %o3 ! Reload lock value + cmp %o3, %o2 ! If lock is locked or has + bne 0b ! changed, retry. + stn %g1, [%o0] ! Delay: store seconds + retl + stn %g2, [%o0 + CLONGSIZE] ! Delay: store nanoseconds + SET_SIZE(gethrestime_lasttick) + +/* + * Fast trap for gettimeofday(). Returns a timestruc_t in %o0 and %o1. + * + * This is the handler for the ST_GETHRESTIME trap. + */ + + ENTRY_NP(get_hrestime) + GET_HRESTIME(%o0, %o1, %g1, %g2, %g3, %g4, %g5, %o2, %o3) + CONV_HRESTIME(%o0, %o1, %g1, %g2, %g3) + FAST_TRAP_DONE + SET_SIZE(get_hrestime) + +/* + * Fast trap to return lwp virtual time, uses trap window, leaves traps + * disabled. Returns a 64-bit number in %o0:%o1, which is the number + * of nanoseconds consumed. + * + * This is the handler for the ST_GETHRVTIME trap. + * + * Register usage: + * %o0, %o1 = return lwp virtual time + * %o2 = CPU/thread + * %o3 = lwp + * %g1 = scratch + * %g5 = scratch + */ + ENTRY_NP(get_virtime) + GET_NATIVE_TIME(%g5, %g1, %g2) ! %g5 = native time in ticks + CPU_ADDR(%g2, %g3) ! CPU struct ptr to %g2 + ldn [%g2 + CPU_THREAD], %g2 ! thread pointer to %g2 + ldn [%g2 + T_LWP], %g3 ! lwp pointer to %g3 + + /* + * Subtract start time of current microstate from time + * of day to get increment for lwp virtual time. + */ + ldx [%g3 + LWP_STATE_START], %g1 ! ms_state_start + sub %g5, %g1, %g5 + + /* + * Add current value of ms_acct[LMS_USER] + */ + ldx [%g3 + LWP_ACCT_USER], %g1 ! ms_acct[LMS_USER] + add %g5, %g1, %g5 + NATIVE_TIME_TO_NSEC(%g5, %g1, %o0) + + srl %g5, 0, %o1 ! %o1 = lo32(%g5) + srlx %g5, 32, %o0 ! %o0 = hi32(%g5) + + FAST_TRAP_DONE + SET_SIZE(get_virtime) + + + + .seg ".text" +hrtime_base_panic: + .asciz "hrtime_base stepping back" + + + ENTRY_NP(hres_tick) + save %sp, -SA(MINFRAME), %sp ! get a new window + + sethi %hi(hrestime), %l4 + ldstub [%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5 ! try locking +7: tst %l5 + bz,pt %xcc, 8f ! if we got it, drive on + ld [%l4 + %lo(nsec_scale)], %l5 ! delay: %l5 = scaling factor + ldub [%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5 +9: tst %l5 + bz,a,pn %xcc, 7b + ldstub [%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5 + ba,pt %xcc, 9b + ldub [%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5 +8: + membar #StoreLoad|#StoreStore + + ! + ! update hres_last_tick. %l5 has the scaling factor (nsec_scale). + ! + ldx [%l4 + %lo(hrtime_base)], %g1 ! load current hrtime_base + GET_NATIVE_TIME(%l0, %l3, %l6) ! current native time + stx %l0, [%l4 + %lo(hres_last_tick)]! prev = current + ! convert native time to nsecs + NATIVE_TIME_TO_NSEC_SCALE(%l0, %l5, %l2, NSEC_SHIFT) + + sub %l0, %g1, %i1 ! get accurate nsec delta + + ldx [%l4 + %lo(hrtime_base)], %l1 + cmp %l1, %l0 + bg,pn %xcc, 9f + nop + + stx %l0, [%l4 + %lo(hrtime_base)] ! update hrtime_base + + ! + ! apply adjustment, if any + ! + ldx [%l4 + %lo(hrestime_adj)], %l0 ! %l0 = hrestime_adj + brz %l0, 2f + ! hrestime_adj == 0 ? + ! yes, skip adjustments + clr %l5 ! delay: set adj to zero + tst %l0 ! is hrestime_adj >= 0 ? + bge,pt %xcc, 1f ! yes, go handle positive case + srl %i1, ADJ_SHIFT, %l5 ! delay: %l5 = adj + + addcc %l0, %l5, %g0 ! hrestime_adj < -adj ? + bl,pt %xcc, 2f ! yes, use current adj + neg %l5 ! delay: %l5 = -adj + ba,pt %xcc, 2f + mov %l0, %l5 ! no, so set adj = hrestime_adj +1: + subcc %l0, %l5, %g0 ! hrestime_adj < adj ? + bl,a,pt %xcc, 2f ! 
yes, set adj = hrestime_adj + mov %l0, %l5 ! delay: adj = hrestime_adj +2: + ldx [%l4 + %lo(timedelta)], %l0 ! %l0 = timedelta + sub %l0, %l5, %l0 ! timedelta -= adj + + stx %l0, [%l4 + %lo(timedelta)] ! store new timedelta + stx %l0, [%l4 + %lo(hrestime_adj)] ! hrestime_adj = timedelta + + or %l4, %lo(hrestime), %l2 + ldn [%l2], %i2 ! %i2:%i3 = hrestime sec:nsec + ldn [%l2 + CLONGSIZE], %i3 + add %i3, %l5, %i3 ! hrestime.nsec += adj + add %i3, %i1, %i3 ! hrestime.nsec += nslt + + set NANOSEC, %l5 ! %l5 = NANOSEC + cmp %i3, %l5 + bl,pt %xcc, 5f ! if hrestime.tv_nsec < NANOSEC + sethi %hi(one_sec), %i1 ! delay + add %i2, 0x1, %i2 ! hrestime.tv_sec++ + sub %i3, %l5, %i3 ! hrestime.tv_nsec - NANOSEC + mov 0x1, %l5 + st %l5, [%i1 + %lo(one_sec)] +5: + stn %i2, [%l2] + stn %i3, [%l2 + CLONGSIZE] ! store the new hrestime + + membar #StoreStore + + ld [%l4 + %lo(hres_lock)], %i1 + inc %i1 ! release lock + st %i1, [%l4 + %lo(hres_lock)] ! clear hres_lock + + ret + restore + +9: + ! + ! release hres_lock + ! + ld [%l4 + %lo(hres_lock)], %i1 + inc %i1 + st %i1, [%l4 + %lo(hres_lock)] + + sethi %hi(hrtime_base_panic), %o0 + call panic + or %o0, %lo(hrtime_base_panic), %o0 + + SET_SIZE(hres_tick) + +#endif /* lint */ + +#if !defined(lint) && !defined(__lint) + + .seg ".text" +kstat_q_panic_msg: + .asciz "kstat_q_exit: qlen == 0" + + ENTRY(kstat_q_panic) + save %sp, -SA(MINFRAME), %sp + sethi %hi(kstat_q_panic_msg), %o0 + call panic + or %o0, %lo(kstat_q_panic_msg), %o0 + /*NOTREACHED*/ + SET_SIZE(kstat_q_panic) + +#define BRZPN brz,pn +#define BRZPT brz,pt + +#define KSTAT_Q_UPDATE(QOP, QBR, QZERO, QRETURN, QTYPE) \ + ld [%o0 + QTYPE/**/CNT], %o1; /* %o1 = old qlen */ \ + QOP %o1, 1, %o2; /* %o2 = new qlen */ \ + QBR %o1, QZERO; /* done if qlen == 0 */ \ + st %o2, [%o0 + QTYPE/**/CNT]; /* delay: save qlen */ \ + ldx [%o0 + QTYPE/**/LASTUPDATE], %o3; \ + ldx [%o0 + QTYPE/**/TIME], %o4; /* %o4 = old time */ \ + ldx [%o0 + QTYPE/**/LENTIME], %o5; /* %o5 = old lentime */ \ + sub %g1, %o3, %o2; /* %o2 = time delta */ \ + mulx %o1, %o2, %o3; /* %o3 = cur lentime */ \ + add %o4, %o2, %o4; /* %o4 = new time */ \ + add %o5, %o3, %o5; /* %o5 = new lentime */ \ + stx %o4, [%o0 + QTYPE/**/TIME]; /* save time */ \ + stx %o5, [%o0 + QTYPE/**/LENTIME]; /* save lentime */ \ +QRETURN; \ + stx %g1, [%o0 + QTYPE/**/LASTUPDATE]; /* lastupdate = now */ + + .align 16 + ENTRY(kstat_waitq_enter) + GET_NATIVE_TIME(%g1, %g2, %g3) + KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_W) + SET_SIZE(kstat_waitq_enter) + + .align 16 + ENTRY(kstat_waitq_exit) + GET_NATIVE_TIME(%g1, %g2, %g3) + KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, retl, KSTAT_IO_W) + SET_SIZE(kstat_waitq_exit) + + .align 16 + ENTRY(kstat_runq_enter) + GET_NATIVE_TIME(%g1, %g2, %g3) + KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_R) + SET_SIZE(kstat_runq_enter) + + .align 16 + ENTRY(kstat_runq_exit) + GET_NATIVE_TIME(%g1, %g2, %g3) + KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, retl, KSTAT_IO_R) + SET_SIZE(kstat_runq_exit) + + .align 16 + ENTRY(kstat_waitq_to_runq) + GET_NATIVE_TIME(%g1, %g2, %g3) + KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, 1:, KSTAT_IO_W) + KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_R) + SET_SIZE(kstat_waitq_to_runq) + + .align 16 + ENTRY(kstat_runq_back_to_waitq) + GET_NATIVE_TIME(%g1, %g2, %g3) + KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, 1:, KSTAT_IO_R) + KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_W) + SET_SIZE(kstat_runq_back_to_waitq) + +#endif /* !(lint || __lint) */ + +#ifdef lint + +int64_t timedelta; +hrtime_t hres_last_tick; 
+timestruc_t hrestime; +int64_t hrestime_adj; +int hres_lock; +uint_t nsec_scale; +hrtime_t hrtime_base; +int traptrace_use_stick; + +#else /* lint */ + /* + * -- WARNING -- + * + * The following variables MUST be together on a 128-byte boundary. + * In addition to the primary performance motivation (having them all + * on the same cache line(s)), code here and in the GET*TIME() macros + * assumes that they all have the same high 22 address bits (so + * there's only one sethi). + */ + .seg ".data" + .global timedelta, hres_last_tick, hrestime, hrestime_adj + .global hres_lock, nsec_scale, hrtime_base, traptrace_use_stick + .global nsec_shift, adj_shift + + /* XXX - above comment claims 128-bytes is necessary */ + .align 64 +timedelta: + .word 0, 0 /* int64_t */ +hres_last_tick: + .word 0, 0 /* hrtime_t */ +hrestime: + .nword 0, 0 /* 2 longs */ +hrestime_adj: + .word 0, 0 /* int64_t */ +hres_lock: + .word 0 +nsec_scale: + .word 0 +hrtime_base: + .word 0, 0 +traptrace_use_stick: + .word 0 +nsec_shift: + .word NSEC_SHIFT +adj_shift: + .word ADJ_SHIFT + +#endif /* lint */ + + +/* + * drv_usecwait(clock_t n) [DDI/DKI - section 9F] + * usec_delay(int n) [compatibility - should go one day] + * Delay by spinning. + * + * delay for n microseconds. numbers <= 0 delay 1 usec + * + * With UltraSPARC-III the combination of supporting mixed-speed CPUs + * and variable clock rate for power management requires that we + * use %stick to implement this routine. + */ + +#if defined(lint) + +/*ARGSUSED*/ +void +drv_usecwait(clock_t n) +{} + +/*ARGSUSED*/ +void +usec_delay(int n) +{} + +#else /* lint */ + + ENTRY(drv_usecwait) + ALTENTRY(usec_delay) + brlez,a,pn %o0, 0f + mov 1, %o0 +0: + sethi %hi(sticks_per_usec), %o1 + lduw [%o1 + %lo(sticks_per_usec)], %o1 + mulx %o1, %o0, %o1 ! Scale usec to ticks + inc %o1 ! We don't start on a tick edge + GET_NATIVE_TIME(%o2, %o3, %o4) + add %o1, %o2, %o1 + +1: cmp %o1, %o2 + GET_NATIVE_TIME(%o2, %o3, %o4) + bgeu,pt %xcc, 1b + nop + retl + nop + SET_SIZE(usec_delay) + SET_SIZE(drv_usecwait) +#endif /* lint */ + +#if defined(lint) + +/* ARGSUSED */ +void +pil14_interrupt(int level) +{} + +#else /* lint */ + +/* + * Level-14 interrupt prologue. + */ + ENTRY_NP(pil14_interrupt) + CPU_ADDR(%g1, %g2) + rdpr %pil, %g6 ! %g6 = interrupted PIL + stn %g6, [%g1 + CPU_PROFILE_PIL] ! record interrupted PIL + rdpr %tstate, %g6 + rdpr %tpc, %g5 + btst TSTATE_PRIV, %g6 ! trap from supervisor mode? + bnz,a,pt %xcc, 1f + stn %g5, [%g1 + CPU_PROFILE_PC] ! if so, record kernel PC + stn %g5, [%g1 + CPU_PROFILE_UPC] ! if not, record user PC + ba pil_interrupt_common ! must be large-disp branch + stn %g0, [%g1 + CPU_PROFILE_PC] ! zero kernel PC +1: ba pil_interrupt_common ! must be large-disp branch + stn %g0, [%g1 + CPU_PROFILE_UPC] ! zero user PC + SET_SIZE(pil14_interrupt) + + ENTRY_NP(tick_rtt) + ! + ! Load TICK_COMPARE into %o5; if bit 63 is set, then TICK_COMPARE is + ! disabled. If TICK_COMPARE is enabled, we know that we need to + ! reenqueue the interrupt request structure. We'll then check TICKINT + ! in SOFTINT; if it's set, then we know that we were in a TICK_COMPARE + ! interrupt. In this case, TICK_COMPARE may have been rewritten + ! recently; we'll compare %o5 to the current time to verify that it's + ! in the future. + ! + ! Note that %o5 is live until after 1f. + ! XXX - there is a subroutine call while %o5 is live! + ! + RD_TICKCMPR(%o5, %g1) + srlx %o5, TICKINT_DIS_SHFT, %g1 + brnz,pt %g1, 2f + nop + + rdpr %pstate, %g5 + andn %g5, PSTATE_IE, %g1 + wrpr %g0, %g1, %pstate ! 
Disable vec interrupts + + sethi %hi(cbe_level14_inum), %o1 + ld [%o1 + %lo(cbe_level14_inum)], %o1 + call intr_enqueue_req ! preserves %o5 and %g5 + mov PIL_14, %o0 + + ! Check SOFTINT for TICKINT/STICKINT + rd SOFTINT, %o4 + set (TICK_INT_MASK | STICK_INT_MASK), %o0 + andcc %o4, %o0, %g0 + bz,a,pn %icc, 2f + wrpr %g0, %g5, %pstate ! Enable vec interrupts + + ! clear TICKINT/STICKINT + wr %o0, CLEAR_SOFTINT + + ! + ! Now that we've cleared TICKINT, we can reread %tick and confirm + ! that the value we programmed is still in the future. If it isn't, + ! we need to reprogram TICK_COMPARE to fire as soon as possible. + ! + GET_NATIVE_TIME(%o0, %g1, %g2) ! %o0 = tick + sllx %o0, 1, %o0 ! Clear the DIS bit + srlx %o0, 1, %o0 + cmp %o5, %o0 ! In the future? + bg,a,pt %xcc, 2f ! Yes, drive on. + wrpr %g0, %g5, %pstate ! delay: enable vec intr + + ! + ! If we're here, then we have programmed TICK_COMPARE with a %tick + ! which is in the past; we'll now load an initial step size, and loop + ! until we've managed to program TICK_COMPARE to fire in the future. + ! + mov 8, %o4 ! 8 = arbitrary inital step +1: add %o0, %o4, %o5 ! Add the step + WR_TICKCMPR(%o5,%g1,%g2,__LINE__) ! Write to TICK_CMPR + GET_NATIVE_TIME(%o0, %g1, %g2) ! %o0 = tick + sllx %o0, 1, %o0 ! Clear the DIS bit + srlx %o0, 1, %o0 + cmp %o5, %o0 ! In the future? + bg,a,pt %xcc, 2f ! Yes, drive on. + wrpr %g0, %g5, %pstate ! delay: enable vec intr + ba 1b ! No, try again. + sllx %o4, 1, %o4 ! delay: double step size + +2: ba current_thread_complete + nop + SET_SIZE(tick_rtt) + +#endif /* lint */ + +#if defined(lint) || defined(__lint) + +/* ARGSUSED */ +uint64_t +find_cpufrequency(volatile uchar_t *clock_ptr) +{ + return (0); +} + +#else /* lint */ + +#ifdef DEBUG + .seg ".text" +find_cpufreq_panic: + .asciz "find_cpufrequency: interrupts already disabled on entry" +#endif /* DEBUG */ + + ENTRY_NP(find_cpufrequency) + rdpr %pstate, %g1 + +#ifdef DEBUG + andcc %g1, PSTATE_IE, %g0 ! If DEBUG, check that interrupts + bnz 0f ! are currently enabled + sethi %hi(find_cpufreq_panic), %o1 + call panic + or %o1, %lo(find_cpufreq_panic), %o0 +#endif /* DEBUG */ + +0: + wrpr %g1, PSTATE_IE, %pstate ! Disable interrupts +3: + ldub [%o0], %o1 ! Read the number of seconds + mov %o1, %o2 ! remember initial value in %o2 +1: + GET_NATIVE_TIME(%o3, %g4, %g5) + cmp %o1, %o2 ! did the seconds register roll over? + be,pt %icc, 1b ! branch back if unchanged + ldub [%o0], %o2 ! delay: load the new seconds val + + brz,pn %o2, 3b ! if the minutes just rolled over, + ! the last second could have been + ! inaccurate; try again. + mov %o2, %o4 ! delay: store init. val. in %o2 +2: + GET_NATIVE_TIME(%o5, %g4, %g5) + cmp %o2, %o4 ! did the seconds register roll over? + be,pt %icc, 2b ! branch back if unchanged + ldub [%o0], %o4 ! delay: load the new seconds val + + brz,pn %o4, 0b ! if the minutes just rolled over, + ! the last second could have been + ! inaccurate; try again. + wrpr %g0, %g1, %pstate ! delay: re-enable interrupts + + retl + sub %o5, %o3, %o0 ! return the difference in ticks + SET_SIZE(find_cpufrequency) + +#endif /* lint */ + +#if defined(lint) +/* + * Prefetch a page_t for write or read, this assumes a linear + * scan of sequential page_t's. + */ +/*ARGSUSED*/ +void +prefetch_page_w(void *pp) +{} + +/*ARGSUSED*/ +void +prefetch_page_r(void *pp) +{} +#else /* lint */ + +#if defined(CHEETAH) || defined(CHEETAH_PLUS) || defined(JALAPENO) || \ + defined(SERRANO) + ! + ! On US-III, the prefetch instruction queue is 8 entries deep. + ! 
Also, prefetches for write put data in the E$, which has + ! lines of 512 bytes for an 8MB cache. Each E$ line is further + ! subblocked into 64 byte chunks. + ! + ! Since prefetch can only bring in 64 bytes at a time (See Sparc + ! v9 Architecture Manual pp.204) and a page_t is 128 bytes, + ! then 2 prefetches are required in order to bring an entire + ! page into the E$. + ! + ! Since the prefetch queue is 8 entries deep, we currently can + ! only have 4 prefetches for page_t's outstanding. Thus, we + ! prefetch n+4 ahead of where we are now: + ! + ! 4 * sizeof(page_t) -> 512 + ! 4 * sizeof(page_t) +64 -> 576 + ! + ! Example + ! ======= + ! contiguous page array in memory... + ! + ! |AAA1|AAA2|BBB1|BBB2|CCC1|CCC2|DDD1|DDD2|XXX1|XXX2|YYY1|YYY2|... + ! ^ ^ ^ ^ ^ ^ + ! pp | pp+4*sizeof(page)+64 + ! | + ! pp+4*sizeof(page) + ! + ! Prefetch + ! Queue + ! +-------+<--- In this iteration, we're working with pp (AAA1), + ! |Preftch| but we enqueue prefetch for addr = XXX1 + ! | XXX1 | + ! +-------+<--- this queue slot will be a prefetch instruction for + ! |Preftch| for addr = pp + 4*sizeof(page_t) + 64 (or second + ! | XXX2 | half of page XXX) + ! +-------+ + ! |Preftch|<-+- The next time around this function, we'll be + ! | YYY1 | | working with pp = BBB1, but will be enqueueing + ! +-------+ | prefetches to for both halves of page YYY, + ! |Preftch| | while both halves of page XXX are in transit + ! | YYY2 |<-+ make their way into the E$. + ! +-------+ + ! |Preftch| + ! | ZZZ1 | + ! +-------+ + ! . . + ! : : + ! + ! E$ + ! +============================================... + ! | XXX1 | XXX2 | YYY1 | YYY2 | ZZZ1 | ZZZ2 | + ! +============================================... + ! | | | | | | | + ! +============================================... + ! . + ! : + ! + ! So we should expect the first four page accesses to stall + ! while we warm up the cache, afterwhich, most of the pages + ! will have their pp ready in the E$. + ! + ! Also note that if sizeof(page_t) grows beyond 128, then + ! we'll need an additional prefetch to get an entire page + ! into the E$, thus reducing the number of outstanding page + ! prefetches to 2 (ie. 3 prefetches/page = 6 queue slots) + ! etc. + ! + ! Cheetah+ + ! ======== + ! On Cheetah+ we use "#n_write" prefetches as these avoid + ! unnecessary RTS->RTO bus transaction state change, and + ! just issues RTO transaction. (See pp.77 of Cheetah+ Delta + ! PRM). On Cheetah, #n_write prefetches are reflected with + ! RTS->RTO state transition regardless. + ! +#define STRIDE1 512 +#define STRIDE2 576 + +#if STRIDE1 != (PAGE_SIZE * 4) +#error "STRIDE1 != (PAGE_SIZE * 4)" +#endif /* STRIDE1 != (PAGE_SIZE * 4) */ + + ENTRY(prefetch_page_w) + prefetch [%o0+STRIDE1], #n_writes + retl + prefetch [%o0+STRIDE2], #n_writes + SET_SIZE(prefetch_page_w) + + ! + ! Note on CHEETAH to prefetch for read, we really use #one_write. + ! This fetches to E$ (general use) rather than P$ (floating point use). + ! + ENTRY(prefetch_page_r) + prefetch [%o0+STRIDE1], #one_write + retl + prefetch [%o0+STRIDE2], #one_write + SET_SIZE(prefetch_page_r) + +#elif defined(SPITFIRE) || defined(HUMMINGBIRD) + + ! + ! UltraSparcII can have up to 3 prefetches outstanding. + ! A page_t is 128 bytes (2 prefetches of 64 bytes each) + ! So prefetch for pp + 1, which is + ! + ! pp + sizeof(page_t) + ! and + ! pp + sizeof(page_t) + 64 + ! 
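+	! With sizeof (page_t) == 128 that works out to:
+	!
+	!	STRIDE1 = sizeof (page_t)      = 128
+	!	STRIDE2 = sizeof (page_t) + 64 = 192
+	!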
+#define STRIDE1 128 +#define STRIDE2 192 + +#if STRIDE1 != PAGE_SIZE +#error "STRIDE1 != PAGE_SIZE" +#endif /* STRIDE1 != PAGE_SIZE */ + + ENTRY(prefetch_page_w) + prefetch [%o0+STRIDE1], #n_writes + retl + prefetch [%o0+STRIDE2], #n_writes + SET_SIZE(prefetch_page_w) + + ENTRY(prefetch_page_r) + prefetch [%o0+STRIDE1], #n_reads + retl + prefetch [%o0+STRIDE2], #n_reads + SET_SIZE(prefetch_page_r) +#else /* SPITFIRE || HUMMINGBIRD */ + +#error "You need to fix this for your new cpu type." + +#endif /* SPITFIRE || HUMMINGBIRD */ + +#endif /* lint */ + +#if defined(lint) +/* + * Prefetch struct smap for write. + */ +/*ARGSUSED*/ +void +prefetch_smap_w(void *smp) +{} +#else /* lint */ + +#if defined(CHEETAH) || defined(CHEETAH_PLUS) || defined(JALAPENO) || \ + defined(SERRANO) + +#define PREFETCH_Q_LEN 8 + +#elif defined(SPITFIRE) || defined(HUMMINGBIRD) + +#define PREFETCH_Q_LEN 3 + +#else /* SPITFIRE || HUMMINGBIRD */ + +#error You need to fix this for your new cpu type. + +#endif /* SPITFIRE || HUMMINGBIRD */ + +#include <vm/kpm.h> + +#ifdef SEGKPM_SUPPORT + +#define SMAP_SIZE 72 +#define SMAP_STRIDE (((PREFETCH_Q_LEN * 64) / SMAP_SIZE) * 64) + +#else /* SEGKPM_SUPPORT */ + + ! + ! The hardware will prefetch the 64 byte cache aligned block + ! that contains the address specified in the prefetch instruction. + ! Since the size of the smap struct is 48 bytes, issuing 1 prefetch + ! per pass will suffice as long as we prefetch far enough ahead to + ! make sure we don't stall for the cases where the smap object + ! spans multiple hardware prefetch blocks. Let's prefetch as far + ! ahead as the hardware will allow. + ! + ! The smap array is processed with decreasing address pointers. + ! +#define SMAP_SIZE 48 +#define SMAP_STRIDE (PREFETCH_Q_LEN * SMAP_SIZE) + +#endif /* SEGKPM_SUPPORT */ + + ENTRY(prefetch_smap_w) + retl + prefetch [%o0-SMAP_STRIDE], #n_writes + SET_SIZE(prefetch_smap_w) + +#endif /* lint */ + +#if defined(lint) || defined(__lint) + +/* ARGSUSED */ +uint64_t +getidsr(void) +{ return 0; } + +#else /* lint */ + + ENTRY_NP(getidsr) + retl + ldxa [%g0]ASI_INTR_DISPATCH_STATUS, %o0 + SET_SIZE(getidsr) + +#endif /* lint */ diff --git a/usr/src/uts/sun4u/cpu/mach_cpu_module.c b/usr/src/uts/sun4u/cpu/mach_cpu_module.c new file mode 100644 index 0000000000..c9dc47c061 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/mach_cpu_module.c @@ -0,0 +1,298 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/cpu_module.h> +#include <vm/page.h> +#include <vm/seg_map.h> + +void +cpu_fiximp(dnode_t dnode) +{} + +void +ce_err(void) +{} + +void +ce_err_tl1(void) +{} + +void +async_err(void) +{} + +void +cpu_flush_ecache(void) +{} + +void +cpu_disable_errors(void) +{} + +/* It could be removed later if prom enables error handling */ +void +cpu_enable_errors(void) +{} + +/*ARGSUSED*/ +void +cpu_faulted_enter(struct cpu *cp) +{} + +/*ARGSUSED*/ +void +cpu_faulted_exit(struct cpu *cp) +{} + +/*ARGSUSED*/ +void +cpu_ce_count_unum(struct async_flt *ecc, int len, char *unum) +{} + +/*ARGSUSED*/ +void +cpu_ce_scrub_mem_err(struct async_flt *ecc, boolean_t triedcpulogout) +{} + +/*ARGSUSED*/ +void +cpu_ce_log_err(struct async_flt *ecc, errorq_elem_t *eqep) +{} + +/*ARGSUSED*/ +void +cpu_ue_log_err(struct async_flt *ecc) +{} + +/*ARGSUSED*/ +int +ce_scrub_xdiag_recirc(struct async_flt *aflt, errorq_t *eqp, + errorq_elem_t *eqep, size_t afltoffset) +{ return (0); } + +/*ARGSUSED*/ +char * +flt_to_error_type(struct async_flt *aflt) +{ return (NULL); } + +int +cpu_aflt_size(void) +{ return (0); } + +void +cpu_async_panic_callb(void) +{} + +/*ARGSUSED*/ +void +cpu_check_allcpus(struct async_flt *aflt) +{} + +/*ARGSUSED*/ +int +cpu_get_mem_unum(int synd_stat, ushort_t synd, uint64_t afsr, uint64_t afar, + int cpuid, int flt_in_memory, ushort_t flt_status, char *buf, + int buflen, int *lenp) +{ return (ENOTSUP); } + +/*ARGSUSED*/ +int +cpu_get_mem_unum_aflt(int synd_stat, struct async_flt *aflt, + char *buf, int buflen, int *lenp) +{ return (ENOTSUP); } + +/*ARGSUSED*/ +int +cpu_get_cpu_unum(int cpuid, char *buf, int buflen, int *lenp) +{ return (ENOTSUP); } + +/*ARGSUSED*/ +int +cpu_get_mem_name(uint64_t synd, uint64_t *afsr, uint64_t afar, + char *buf, int buflen, int *lenp) +{ return (ENOTSUP); } + +/*ARGSUSED*/ +size_t +cpu_get_name_bufsize() +{ return (0); } + +/*ARGSUSED*/ +int +cpu_get_mem_info(uint64_t synd, uint64_t afar, + uint64_t *mem_sizep, uint64_t *seg_sizep, uint64_t *bank_sizep, + int *segsp, int *banksp, int *mcidp) +{ return (ENOTSUP); } + +/*ARGSUSED*/ +void +cpu_ereport_post(struct async_flt *aflt) +{} + +/*ARGSUSED*/ +void +cpu_run_bus_error_handlers(struct async_flt *aflt, int expected) +{} + +void +cpu_errorq_dispatch(char *error_class, void *payload, size_t payload_sz, + errorq_t *eqp, uint_t flag) +{} + +void +clr_datapath(void) +{} + +/*ARGSUSED*/ +void +read_ecc_data(struct async_flt *ecc, short verbose, short ce_err) +{} + +/*ARGSUSED*/ +void +itlb_rd_entry(uint_t entry, tte_t *tte, uint64_t *va_tag) +{} + +/*ARGSUSED*/ +void +dtlb_rd_entry(uint_t entry, tte_t *tte, uint64_t *va_tag) +{} + +/* + * tick operations + */ + +void +cpu_clearticknpt(void) +{ } + +/* + * Ecache scrub operations + */ +void +cpu_init_cache_scrub(void) +{} + +/*ARGSUSED*/ +void +cpu_busy_ecache_scrub(struct cpu *cp) +{} + +/*ARGSUSED*/ +void +cpu_idle_ecache_scrub(struct cpu *cp) +{} + +/* ARGSUSED */ +void +cpu_check_ce(int flag, uint64_t pa, caddr_t va, uint_t bpp) +{} + +/* ARGSUSED */ +void +prefetch_page_w(void *pp) +{ +#define ECACHE_SUBBLOCKS_PER_PAGE 2 +#define ECACHE_SUBBLOCK_SIZE_BYTES 64 +#define ECACHE_PAGE_BYTE_MAX \ + (ECACHE_SUBBLOCKS_PER_PAGE*ECACHE_SUBBLOCK_SIZE_BYTES+1) + + /* + * The following line is intended to cause an error + * whenever the sun4u page_t grows beyond 128 + * bytes. 
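+	 * (ECACHE_PAGE_BYTE_MAX - sizeof (page_t) goes to zero or negative
+	 * in that case, so the array declaration below no longer compiles.)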
+ * + * If you get an error here, you'll need to change + * the 'prefetch_page_w' assembly language code + * (see also prefetch_page_w prologue comment) + */ + /*LINTED*/ + volatile int garbage[ECACHE_PAGE_BYTE_MAX - sizeof (page_t)]; +} + +/* ARGSUSED */ +void +prefetch_page_r(void *pp) +{ +#define ECACHE_SUBBLOCKS_PER_PAGE 2 +#define ECACHE_SUBBLOCK_SIZE_BYTES 64 +#define ECACHE_PAGE_BYTE_MAX \ + (ECACHE_SUBBLOCKS_PER_PAGE*ECACHE_SUBBLOCK_SIZE_BYTES+1) + + /* + * The following line is intended to cause an error + * whenever the sun4u page_t grows beyond 128 + * bytes. + * + * If you get an error here, you'll need to change + * the 'prefetch_page_r' assembly language code + * (see also prefetch_page_w prologue comment) + */ + /*LINTED*/ + volatile int garbage[ECACHE_PAGE_BYTE_MAX - sizeof (page_t)]; +} + + +#ifdef SEGKPM_SUPPORT +#define SMAP_SIZE 80 +#else +#define SMAP_SIZE 56 +#endif + +/* ARGSUSED */ +void +prefetch_smap_w(void *smp) +{ + + /* + * The following lines are intended to cause an error + * whenever the smap object size changes from the current + * size of 48 bytes. If you get an error here, you'll + * need to update the code in the 'prefetch_smap_w' assembly + * language code. + */ + /*LINTED*/ + volatile int smap_size_changed [SMAP_SIZE - sizeof (struct smap) + 1]; + volatile int smap_size_changed2 [sizeof (struct smap) - SMAP_SIZE + 1]; +} + +void +kdi_flush_caches(void) +{} + +/*ARGSUSED*/ +void +mmu_init_kernel_pgsz(struct hat *hat) +{ +} + +size_t +mmu_get_kernel_lpsize(size_t value) +{ + return (value); +} diff --git a/usr/src/uts/sun4u/cpu/spitfire.c b/usr/src/uts/sun4u/cpu/spitfire.c new file mode 100644 index 0000000000..79dc7f16f9 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/spitfire.c @@ -0,0 +1,4568 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/archsystm.h> +#include <sys/machparam.h> +#include <sys/machsystm.h> +#include <sys/cpu.h> +#include <sys/elf_SPARC.h> +#include <vm/hat_sfmmu.h> +#include <vm/page.h> +#include <sys/cpuvar.h> +#include <sys/spitregs.h> +#include <sys/async.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/dditypes.h> +#include <sys/sunddi.h> +#include <sys/cpu_module.h> +#include <sys/prom_debug.h> +#include <sys/vmsystm.h> +#include <sys/prom_plat.h> +#include <sys/sysmacros.h> +#include <sys/intreg.h> +#include <sys/machtrap.h> +#include <sys/ontrap.h> +#include <sys/ivintr.h> +#include <sys/atomic.h> +#include <sys/panic.h> +#include <sys/ndifm.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/fm/cpu/UltraSPARC-II.h> +#include <sys/ddi.h> +#include <sys/ecc_kstat.h> +#include <sys/watchpoint.h> +#include <sys/dtrace.h> +#include <sys/errclassify.h> + +uchar_t *ctx_pgsz_array = NULL; + +/* + * Structure for the 8 byte ecache data dump and the associated AFSR state. + * There will be 8 of these structures used to dump an ecache line (64 bytes). + */ +typedef struct sf_ec_data_elm { + uint64_t ec_d8; + uint64_t ec_afsr; +} ec_data_t; + +/* + * Define spitfire (Ultra I/II) specific asynchronous error structure + */ +typedef struct spitfire_async_flt { + struct async_flt cmn_asyncflt; /* common - see sun4u/sys/async.h */ + ushort_t flt_type; /* types of faults - cpu specific */ + ec_data_t flt_ec_data[8]; /* for E$ or mem dump/state */ + uint64_t flt_ec_tag; /* E$ tag info */ + int flt_ec_lcnt; /* number of bad E$ lines */ + ushort_t flt_sdbh; /* UDBH reg */ + ushort_t flt_sdbl; /* UDBL reg */ +} spitf_async_flt; + +/* + * Prototypes for support routines in spitfire_asm.s: + */ +extern void flush_ecache(uint64_t physaddr, size_t size, size_t linesize); +extern uint64_t get_lsu(void); +extern void set_lsu(uint64_t ncc); +extern void get_ecache_dtag(uint32_t ecache_idx, uint64_t *data, uint64_t *tag, + uint64_t *oafsr, uint64_t *acc_afsr); +extern uint64_t check_ecache_line(uint32_t id, uint64_t *acc_afsr); +extern uint64_t get_ecache_tag(uint32_t id, uint64_t *nafsr, + uint64_t *acc_afsr); +extern uint64_t read_and_clear_afsr(); +extern void write_ec_tag_parity(uint32_t id); +extern void write_hb_ec_tag_parity(uint32_t id); + +/* + * Spitfire module routines: + */ +static void cpu_async_log_err(void *flt); +/*PRINTFLIKE6*/ +static void cpu_aflt_log(int ce_code, int tagnum, spitf_async_flt *spflt, + uint_t logflags, const char *endstr, const char *fmt, ...); + +static void cpu_read_paddr(struct async_flt *aflt, short verbose, short ce_err); +static void cpu_ce_log_status(spitf_async_flt *spf_flt, char *unum); +static void cpu_log_ecmem_info(spitf_async_flt *spf_flt); + +static void log_ce_err(struct async_flt *aflt, char *unum); +static void log_ue_err(struct async_flt *aflt, char *unum); +static void check_misc_err(spitf_async_flt *spf_flt); +static ushort_t ecc_gen(uint_t high_bytes, uint_t low_bytes); +static int check_ecc(struct async_flt *aflt); +static uint_t get_cpu_status(uint64_t arg); +static uint64_t clear_errors(spitf_async_flt *spf_flt, uint64_t *acc_afsr); +static void scan_ecache(uint64_t *afar, ec_data_t *data, uint64_t *tag, + int *m, uint64_t *afsr); +static void ecache_kstat_init(struct cpu *cp); +static void ecache_scrub_log(ec_data_t *ec_data, uint64_t ec_tag, + uint64_t paddr, int mpb, uint64_t); +static uint64_t ecache_scrub_misc_err(int, 
uint64_t); +static void ecache_scrub_tag_err(uint64_t, uchar_t, uint32_t); +static void ecache_page_retire(void *); +static int ecc_kstat_update(kstat_t *ksp, int rw); +static int ce_count_unum(int status, int len, char *unum); +static void add_leaky_bucket_timeout(void); +static int synd_to_synd_code(int synd_status, ushort_t synd); + +extern uint_t read_all_memscrub; +extern void memscrub_run(void); + +static uchar_t isus2i; /* set if sabre */ +static uchar_t isus2e; /* set if hummingbird */ + +/* + * Default ecache mask and shift settings for Spitfire. If we detect a + * different CPU implementation, we will modify these values at boot time. + */ +static uint64_t cpu_ec_tag_mask = S_ECTAG_MASK; +static uint64_t cpu_ec_state_mask = S_ECSTATE_MASK; +static uint64_t cpu_ec_par_mask = S_ECPAR_MASK; +static int cpu_ec_par_shift = S_ECPAR_SHIFT; +static int cpu_ec_tag_shift = S_ECTAG_SHIFT; +static int cpu_ec_state_shift = S_ECSTATE_SHIFT; +static uchar_t cpu_ec_state_exl = S_ECSTATE_EXL; +static uchar_t cpu_ec_state_mod = S_ECSTATE_MOD; +static uchar_t cpu_ec_state_shr = S_ECSTATE_SHR; +static uchar_t cpu_ec_state_own = S_ECSTATE_OWN; + +/* + * Default ecache state bits for Spitfire. These individual bits indicate if + * the given line is in any of the valid or modified states, respectively. + * Again, we modify these at boot if we detect a different CPU. + */ +static uchar_t cpu_ec_state_valid = S_ECSTATE_VALID; +static uchar_t cpu_ec_state_dirty = S_ECSTATE_DIRTY; +static uchar_t cpu_ec_parity = S_EC_PARITY; +static uchar_t cpu_ec_state_parity = S_ECSTATE_PARITY; + +/* + * This table is used to determine which bit(s) is(are) bad when an ECC + * error occurrs. The array is indexed an 8-bit syndrome. The entries + * of this array have the following semantics: + * + * 00-63 The number of the bad bit, when only one bit is bad. + * 64 ECC bit C0 is bad. + * 65 ECC bit C1 is bad. + * 66 ECC bit C2 is bad. + * 67 ECC bit C3 is bad. + * 68 ECC bit C4 is bad. + * 69 ECC bit C5 is bad. + * 70 ECC bit C6 is bad. + * 71 ECC bit C7 is bad. + * 72 Two bits are bad. + * 73 Three bits are bad. + * 74 Four bits are bad. + * 75 More than Four bits are bad. + * 76 NO bits are bad. + * Based on "Galaxy Memory Subsystem SPECIFICATION" rev 0.6, pg. 28. 
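+ *
+ *	Illustrative decode of a table entry (sketch only; it matches the
+ *	SYND_IS_SINGLE_BIT_* macros defined below):
+ *
+ *		code = ecc_syndrome_tab[synd & 0xFF];
+ *		if (code < C0)		single data bit 'code' is bad
+ *		else if (code <= C7)	check bit C(code - C0) is bad
+ *		else if (code == NA)	no bits are bad
+ *		else			M2/M3/M4/MX: multiple bits are bad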
+ */ + +#define C0 64 +#define C1 65 +#define C2 66 +#define C3 67 +#define C4 68 +#define C5 69 +#define C6 70 +#define C7 71 +#define M2 72 +#define M3 73 +#define M4 74 +#define MX 75 +#define NA 76 + +#define SYND_IS_SINGLE_BIT_DATA(synd_code) ((synd_code >= 0) && \ + (synd_code < C0)) +#define SYND_IS_SINGLE_BIT_CHK(synd_code) ((synd_code >= C0) && \ + (synd_code <= C7)) + +static char ecc_syndrome_tab[] = +{ + NA, C0, C1, M2, C2, M2, M2, M3, C3, M2, M2, M3, M2, M3, M3, M4, + C4, M2, M2, 32, M2, 57, MX, M2, M2, 37, 49, M2, 40, M2, M2, 44, + C5, M2, M2, 33, M2, 61, 4, M2, M2, MX, 53, M2, 45, M2, M2, 41, + M2, 0, 1, M2, 10, M2, M2, MX, 15, M2, M2, MX, M2, M3, M3, M2, + C6, M2, M2, 42, M2, 59, 39, M2, M2, MX, 51, M2, 34, M2, M2, 46, + M2, 25, 29, M2, 27, M4, M2, MX, 31, M2, M4, MX, M2, MX, MX, M2, + M2, MX, 36, M2, 7, M2, M2, 54, MX, M2, M2, 62, M2, 48, 56, M2, + M3, M2, M2, MX, M2, MX, 22, M2, M2, 18, MX, M2, M3, M2, M2, MX, + C7, M2, M2, 47, M2, 63, MX, M2, M2, 6, 55, M2, 35, M2, M2, 43, + M2, 5, MX, M2, MX, M2, M2, 50, 38, M2, M2, 58, M2, 52, 60, M2, + M2, 17, 21, M2, 19, M4, M2, MX, 23, M2, M4, MX, M2, MX, MX, M2, + M3, M2, M2, MX, M2, MX, 30, M2, M2, 26, MX, M2, M3, M2, M2, MX, + M2, 8, 13, M2, 2, M2, M2, M3, 3, M2, M2, M3, M2, MX, MX, M2, + M3, M2, M2, M3, M2, MX, 16, M2, M2, 20, MX, M2, MX, M2, M2, MX, + M3, M2, M2, M3, M2, MX, 24, M2, M2, 28, MX, M2, MX, M2, M2, MX, + M4, 12, 9, M2, 14, M2, M2, MX, 11, M2, M2, MX, M2, MX, MX, M4 +}; + +#define SYND_TBL_SIZE 256 + +/* + * Hack for determining UDBH/UDBL, for later cpu-specific error reporting. + * Cannot use bit 3 in afar, because it is a valid bit on a Sabre/Hummingbird. + */ +#define UDBL_REG 0x8000 +#define UDBL(synd) ((synd & UDBL_REG) >> 15) +#define SYND(synd) (synd & 0x7FFF) + +/* + * These error types are specific to Spitfire and are used internally for the + * spitfire fault structure flt_type field. + */ +#define CPU_UE_ERR 0 /* uncorrectable errors - UEs */ +#define CPU_EDP_LDP_ERR 1 /* LDP or EDP parity error */ +#define CPU_WP_ERR 2 /* WP parity error */ +#define CPU_BTO_BERR_ERR 3 /* bus timeout errors */ +#define CPU_PANIC_CP_ERR 4 /* cp error from panic polling */ +#define CPU_TRAPPING_CP_ERR 5 /* for sabre/hbird only, cp error */ +#define CPU_BADLINE_CI_ERR 6 /* E$ clean_bad line when idle */ +#define CPU_BADLINE_CB_ERR 7 /* E$ clean_bad line when busy */ +#define CPU_BADLINE_DI_ERR 8 /* E$ dirty_bad line when idle */ +#define CPU_BADLINE_DB_ERR 9 /* E$ dirty_bad line when busy */ +#define CPU_ORPHAN_CP_ERR 10 /* Orphan CP error */ +#define CPU_ECACHE_ADDR_PAR_ERR 11 /* Ecache Address parity error */ +#define CPU_ECACHE_STATE_ERR 12 /* Ecache state error */ +#define CPU_ECACHE_ETP_ETS_ERR 13 /* ETP set but ETS is zero */ +#define CPU_ECACHE_TAG_ERR 14 /* Scrub the E$ tag, if state clean */ +#define CPU_ADDITIONAL_ERR 15 /* Additional errors occurred */ + +/* + * Macro to access the "Spitfire cpu private" data structure. + */ +#define CPU_PRIVATE_PTR(cp, x) (&(((spitfire_private_t *)CPU_PRIVATE(cp))->x)) + +/* + * set to 0 to disable automatic retiring of pages on + * DIMMs that have excessive soft errors + */ +int automatic_page_removal = 1; + +/* + * Heuristic for figuring out which module to replace. + * Relative likelihood that this P_SYND indicates that this module is bad. + * We call it a "score", though, not a relative likelihood. + * + * Step 1. + * Assign a score to each byte of P_SYND according to the following rules: + * If no bits on (0x00) or all bits on (0xFF), then give it a 5. 
+ * If one bit on, give it a 95. + * If seven bits on, give it a 10. + * If two bits on: + * in different nybbles, a 90 + * in same nybble, but unaligned, 85 + * in same nybble and as an aligned pair, 80 + * If six bits on, look at the bits that are off: + * in same nybble and as an aligned pair, 15 + * in same nybble, but unaligned, 20 + * in different nybbles, a 25 + * If three bits on: + * in diferent nybbles, no aligned pairs, 75 + * in diferent nybbles, one aligned pair, 70 + * in the same nybble, 65 + * If five bits on, look at the bits that are off: + * in the same nybble, 30 + * in diferent nybbles, one aligned pair, 35 + * in diferent nybbles, no aligned pairs, 40 + * If four bits on: + * all in one nybble, 45 + * as two aligned pairs, 50 + * one aligned pair, 55 + * no aligned pairs, 60 + * + * Step 2: + * Take the higher of the two scores (one for each byte) as the score + * for the module. + * + * Print the score for each module, and field service should replace the + * module with the highest score. + */ + +/* + * In the table below, the first row/column comment indicates the + * number of bits on in that nybble; the second row/column comment is + * the hex digit. + */ + +static int +p_synd_score_table[256] = { + /* 0 1 1 2 1 2 2 3 1 2 2 3 2 3 3 4 */ + /* 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F */ +/* 0 0 */ 5, 95, 95, 80, 95, 85, 85, 65, 95, 85, 85, 65, 80, 65, 65, 45, +/* 1 1 */ 95, 90, 90, 70, 90, 75, 75, 55, 90, 75, 75, 55, 70, 55, 55, 30, +/* 1 2 */ 95, 90, 90, 70, 90, 75, 75, 55, 90, 75, 75, 55, 70, 55, 55, 30, +/* 2 3 */ 80, 70, 70, 50, 70, 55, 55, 35, 70, 55, 55, 35, 50, 35, 35, 15, +/* 1 4 */ 95, 90, 90, 70, 90, 75, 75, 55, 90, 75, 75, 55, 70, 55, 55, 30, +/* 2 5 */ 85, 75, 75, 55, 75, 60, 60, 40, 75, 60, 60, 40, 55, 40, 40, 20, +/* 2 6 */ 85, 75, 75, 55, 75, 60, 60, 40, 75, 60, 60, 40, 55, 40, 40, 20, +/* 3 7 */ 65, 55, 55, 35, 55, 40, 40, 25, 55, 40, 40, 25, 35, 25, 25, 10, +/* 1 8 */ 95, 90, 90, 70, 90, 75, 75, 55, 90, 75, 75, 55, 70, 55, 55, 30, +/* 2 9 */ 85, 75, 75, 55, 75, 60, 60, 40, 75, 60, 60, 40, 55, 40, 40, 20, +/* 2 A */ 85, 75, 75, 55, 75, 60, 60, 40, 75, 60, 60, 40, 55, 40, 40, 20, +/* 3 B */ 65, 55, 55, 35, 55, 40, 40, 25, 55, 40, 40, 25, 35, 25, 25, 10, +/* 2 C */ 80, 70, 70, 50, 70, 55, 55, 35, 70, 55, 55, 35, 50, 35, 35, 15, +/* 3 D */ 65, 55, 55, 35, 55, 40, 40, 25, 55, 40, 40, 25, 35, 25, 25, 10, +/* 3 E */ 65, 55, 55, 35, 55, 40, 40, 25, 55, 40, 40, 25, 35, 25, 25, 10, +/* 4 F */ 45, 30, 30, 15, 30, 20, 20, 10, 30, 20, 20, 10, 15, 10, 10, 5, +}; + +int +ecc_psynd_score(ushort_t p_synd) +{ + int i, j, a, b; + + i = p_synd & 0xFF; + j = (p_synd >> 8) & 0xFF; + + a = p_synd_score_table[i]; + b = p_synd_score_table[j]; + + return (a > b ? a : b); +} + +/* + * Async Fault Logging + * + * To ease identifying, reading, and filtering async fault log messages, the + * label [AFT#] is now prepended to each async fault message. These messages + * and the logging rules are implemented by cpu_aflt_log(), below. + * + * [AFT0] - Tag for log messages that are associated with corrected ECC errors. + * This includes both corrected ECC memory and ecache faults. + * + * [AFT1] - Tag for log messages that are not ECC corrected (i.e. everything + * else except CE errors) with a priority of 1 (highest). This tag + * is also used for panic messages that result from an async fault. + * + * [AFT2] - These are lower priority diagnostic messages for uncorrected ECC + * [AFT3] or parity errors. 
For example, AFT2 is used for the actual dump + * of the E-$ data and tags. + * + * In a non-DEBUG kernel, AFT > 1 logs will be sent to the system log but not + * printed on the console. To send all AFT logs to both the log and the + * console, set aft_verbose = 1. + */ + +#define CPU_FLTCPU 0x0001 /* print flt_inst as a CPU id */ +#define CPU_SPACE 0x0002 /* print flt_status (data or instr) */ +#define CPU_ERRID 0x0004 /* print flt_id */ +#define CPU_TL 0x0008 /* print flt_tl */ +#define CPU_ERRID_FIRST 0x0010 /* print flt_id first in message */ +#define CPU_AFSR 0x0020 /* print flt_stat as decoded %afsr */ +#define CPU_AFAR 0x0040 /* print flt_addr as %afar */ +#define CPU_AF_PSYND 0x0080 /* print flt_stat %afsr.PSYND */ +#define CPU_AF_ETS 0x0100 /* print flt_stat %afsr.ETS */ +#define CPU_UDBH 0x0200 /* print flt_sdbh and syndrome */ +#define CPU_UDBL 0x0400 /* print flt_sdbl and syndrome */ +#define CPU_FAULTPC 0x0800 /* print flt_pc */ +#define CPU_SYND 0x1000 /* print flt_synd and unum */ + +#define CMN_LFLAGS (CPU_FLTCPU | CPU_SPACE | CPU_ERRID | CPU_TL | \ + CPU_AFSR | CPU_AFAR | CPU_AF_PSYND | \ + CPU_AF_ETS | CPU_UDBH | CPU_UDBL | \ + CPU_FAULTPC) +#define UE_LFLAGS (CMN_LFLAGS | CPU_SYND) +#define CE_LFLAGS (UE_LFLAGS & ~CPU_UDBH & ~CPU_UDBL & ~CPU_TL & \ + ~CPU_SPACE) +#define PARERR_LFLAGS (CMN_LFLAGS) +#define WP_LFLAGS (CMN_LFLAGS & ~CPU_SPACE & ~CPU_TL) +#define CP_LFLAGS (CMN_LFLAGS & ~CPU_SPACE & ~CPU_TL & \ + ~CPU_FLTCPU & ~CPU_FAULTPC) +#define BERRTO_LFLAGS (CMN_LFLAGS) +#define NO_LFLAGS (0) + +#define AFSR_FMTSTR0 "\020\1ME" +#define AFSR_FMTSTR1 "\020\040PRIV\037ISAP\036ETP\035IVUE\034TO" \ + "\033BERR\032LDP\031CP\030WP\027EDP\026UE\025CE" +#define UDB_FMTSTR "\020\012UE\011CE" + +/* + * Maximum number of contexts for Spitfire. + */ +#define MAX_NCTXS (1 << 13) + +/* + * Save the cache bootup state for use when internal + * caches are to be re-enabled after an error occurs. + */ +uint64_t cache_boot_state = 0; + +/* + * PA[31:0] represent Displacement in UPA configuration space. + */ +uint_t root_phys_addr_lo_mask = 0xffffffff; + +/* + * Spitfire legacy globals + */ +int itlb_entries; +int dtlb_entries; + +void +cpu_setup(void) +{ + extern int page_retire_messages; + extern int at_flags; +#if defined(SF_ERRATA_57) + extern caddr_t errata57_limit; +#endif + extern int disable_text_largepages; + extern int disable_initdata_largepages; + + cache |= (CACHE_VAC | CACHE_PTAG | CACHE_IOCOHERENT); + + at_flags = EF_SPARC_32PLUS | EF_SPARC_SUN_US1; + + /* + * Spitfire isn't currently FMA-aware, so we have to enable the + * page retirement messages. + */ + page_retire_messages = 1; + + /* + * save the cache bootup state. + */ + cache_boot_state = get_lsu() & (LSU_IC | LSU_DC); + + /* + * Use the maximum number of contexts available for Spitfire unless + * it has been tuned for debugging. + * We are checking against 0 here since this value can be patched + * while booting. It can not be patched via /etc/system since it + * will be patched too late and thus cause the system to panic. + */ + if (nctxs == 0) + nctxs = MAX_NCTXS; + + if (use_page_coloring) { + do_pg_coloring = 1; + if (use_virtual_coloring) + do_virtual_coloring = 1; + } + + /* + * Tune pp_slots to use up to 1/8th of the tlb entries. + */ + pp_slots = MIN(8, MAXPP_SLOTS); + + /* + * Block stores invalidate all pages of the d$ so pagecopy + * et. al. do not need virtual translations with virtual + * coloring taken into consideration. 
+ */ + pp_consistent_coloring = 0; + + isa_list = + "sparcv9+vis sparcv9 " + "sparcv8plus+vis sparcv8plus " + "sparcv8 sparcv8-fsmuld sparcv7 sparc"; + + cpu_hwcap_flags = AV_SPARC_VIS; + + /* + * On Spitfire, there's a hole in the address space + * that we must never map (the hardware only support 44-bits of + * virtual address). Later CPUs are expected to have wider + * supported address ranges. + * + * See address map on p23 of the UltraSPARC 1 user's manual. + */ + hole_start = (caddr_t)0x80000000000ull; + hole_end = (caddr_t)0xfffff80000000000ull; + + /* + * A spitfire call bug requires us to be a further 4Gbytes of + * firewall from the spec. + * + * See Spitfire Errata #21 + */ + hole_start = (caddr_t)((uintptr_t)hole_start - (1ul << 32)); + hole_end = (caddr_t)((uintptr_t)hole_end + (1ul << 32)); + + /* + * The kpm mapping window. + * kpm_size: + * The size of a single kpm range. + * The overall size will be: kpm_size * vac_colors. + * kpm_vbase: + * The virtual start address of the kpm range within the kernel + * virtual address space. kpm_vbase has to be kpm_size aligned. + */ + kpm_size = (size_t)(2ull * 1024 * 1024 * 1024 * 1024); /* 2TB */ + kpm_size_shift = 41; + kpm_vbase = (caddr_t)0xfffffa0000000000ull; /* 16EB - 6TB */ + +#if defined(SF_ERRATA_57) + errata57_limit = (caddr_t)0x80000000ul; +#endif + + /* + * Allow only 8K, 64K and 4M pages for text by default. + * Allow only 8K and 64K page for initialized data segments by + * default. + */ + disable_text_largepages = (1 << TTE512K) | (1 << TTE32M) | + (1 << TTE256M); + disable_initdata_largepages = (1 << TTE512K) | (1 << TTE4M) | + (1 << TTE32M) | (1 << TTE256M); +} + +static int +getintprop(dnode_t node, char *name, int deflt) +{ + int value; + + switch (prom_getproplen(node, name)) { + case 0: + value = 1; /* boolean properties */ + break; + + case sizeof (int): + (void) prom_getprop(node, name, (caddr_t)&value); + break; + + default: + value = deflt; + break; + } + + return (value); +} + +/* + * Set the magic constants of the implementation. + */ +void +cpu_fiximp(dnode_t dnode) +{ + extern int vac_size, vac_shift; + extern uint_t vac_mask; + extern int dcache_line_mask; + int i, a; + static struct { + char *name; + int *var; + } prop[] = { + "dcache-size", &dcache_size, + "dcache-line-size", &dcache_linesize, + "icache-size", &icache_size, + "icache-line-size", &icache_linesize, + "ecache-size", &ecache_size, + "ecache-line-size", &ecache_alignsize, + "ecache-associativity", &ecache_associativity, + "#itlb-entries", &itlb_entries, + "#dtlb-entries", &dtlb_entries, + }; + + for (i = 0; i < sizeof (prop) / sizeof (prop[0]); i++) { + if ((a = getintprop(dnode, prop[i].name, -1)) != -1) { + *prop[i].var = a; + } + } + + ecache_setsize = ecache_size / ecache_associativity; + + vac_size = S_VAC_SIZE; + vac_mask = MMU_PAGEMASK & (vac_size - 1); + i = 0; a = vac_size; + while (a >>= 1) + ++i; + vac_shift = i; + shm_alignment = vac_size; + vac = 1; + + dcache_line_mask = (dcache_size - 1) & ~(dcache_linesize - 1); + + /* + * UltraSPARC I & II have ecache sizes running + * as follows: .25 MB, .5 MB, 1 MB, 2 MB, 4 MB + * and 8 MB. Adjust the copyin/copyout limits + * according to the cache size. The magic number + * of VIS_COPY_THRESHOLD comes from the copyin/copyout code + * and its floor of VIS_COPY_THRESHOLD bytes before it will use + * VIS instructions. + * + * We assume that all CPUs on the system have the same size + * ecache. We're also called very early in the game. 
+ * /etc/system will be parsed *after* we're called so + * these values can be overwritten. + */ + + hw_copy_limit_1 = VIS_COPY_THRESHOLD; + if (ecache_size <= 524288) { + hw_copy_limit_2 = VIS_COPY_THRESHOLD; + hw_copy_limit_4 = VIS_COPY_THRESHOLD; + hw_copy_limit_8 = VIS_COPY_THRESHOLD; + } else if (ecache_size == 1048576) { + hw_copy_limit_2 = 1024; + hw_copy_limit_4 = 1280; + hw_copy_limit_8 = 1536; + } else if (ecache_size == 2097152) { + hw_copy_limit_2 = 1536; + hw_copy_limit_4 = 2048; + hw_copy_limit_8 = 2560; + } else if (ecache_size == 4194304) { + hw_copy_limit_2 = 2048; + hw_copy_limit_4 = 2560; + hw_copy_limit_8 = 3072; + } else { + hw_copy_limit_2 = 2560; + hw_copy_limit_4 = 3072; + hw_copy_limit_8 = 3584; + } +} + +/* + * Called by setcpudelay + */ +void +cpu_init_tick_freq(void) +{ + /* + * Determine the cpu frequency by calling + * tod_get_cpufrequency. Use an approximate freqency + * value computed by the prom if the tod module + * is not initialized and loaded yet. + */ + if (tod_ops.tod_get_cpufrequency != NULL) { + mutex_enter(&tod_lock); + sys_tick_freq = tod_ops.tod_get_cpufrequency(); + mutex_exit(&tod_lock); + } else { +#if defined(HUMMINGBIRD) + /* + * the hummingbird version of %stick is used as the basis for + * low level timing; this provides an independent constant-rate + * clock for general system use, and frees power mgmt to set + * various cpu clock speeds. + */ + if (system_clock_freq == 0) + cmn_err(CE_PANIC, "invalid system_clock_freq 0x%lx", + system_clock_freq); + sys_tick_freq = system_clock_freq; +#else /* SPITFIRE */ + sys_tick_freq = cpunodes[CPU->cpu_id].clock_freq; +#endif + } +} + + +void shipit(int upaid); +extern uint64_t xc_tick_limit; +extern uint64_t xc_tick_jump_limit; + +#ifdef SEND_MONDO_STATS +uint64_t x_early[NCPU][64]; +#endif + +/* + * Note: A version of this function is used by the debugger via the KDI, + * and must be kept in sync with this version. Any changes made to this + * function to support new chips or to accomodate errata must also be included + * in the KDI-specific version. See spitfire_kdi.c. + */ +void +send_one_mondo(int cpuid) +{ + uint64_t idsr, starttick, endtick; + int upaid, busy, nack; + uint64_t tick, tick_prev; + ulong_t ticks; + + CPU_STATS_ADDQ(CPU, sys, xcalls, 1); + upaid = CPUID_TO_UPAID(cpuid); + tick = starttick = gettick(); + shipit(upaid); + endtick = starttick + xc_tick_limit; + busy = nack = 0; + for (;;) { + idsr = getidsr(); + if (idsr == 0) + break; + /* + * When we detect an irregular tick jump, we adjust + * the timer window to the current tick value. + */ + tick_prev = tick; + tick = gettick(); + ticks = tick - tick_prev; + if (ticks > xc_tick_jump_limit) { + endtick = tick + xc_tick_limit; + } else if (tick > endtick) { + if (panic_quiesce) + return; + cmn_err(CE_PANIC, + "send mondo timeout (target 0x%x) [%d NACK %d BUSY]", + upaid, nack, busy); + } + if (idsr & IDSR_BUSY) { + busy++; + continue; + } + drv_usecwait(1); + shipit(upaid); + nack++; + busy = 0; + } +#ifdef SEND_MONDO_STATS + x_early[getprocessorid()][highbit(gettick() - starttick) - 1]++; +#endif +} + +void +send_mondo_set(cpuset_t set) +{ + int i; + + for (i = 0; i < NCPU; i++) + if (CPU_IN_SET(set, i)) { + send_one_mondo(i); + CPUSET_DEL(set, i); + if (CPUSET_ISNULL(set)) + break; + } +} + +void +syncfpu(void) +{ +} + +/* + * Determine the size of the CPU module's error structure in bytes. This is + * called once during boot to initialize the error queues. 
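For the mondo-send timeout above, note that the deadline is not fixed: whenever send_one_mondo() sees an irregular jump in the tick register it moves the deadline out to the current tick plus xc_tick_limit instead of charging the jump against the target CPU. The sketch below walks a made-up sequence of tick samples through the same bookkeeping; the limits and samples are invented for illustration.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* invented limits and gettick() samples, for illustration only */
	uint64_t xc_tick_limit = 1000, xc_tick_jump_limit = 500;
	uint64_t samples[] = { 0, 300, 5000, 5400, 5800, 6100 };
	int nsamples = sizeof (samples) / sizeof (samples[0]);

	uint64_t tick = samples[0];
	uint64_t endtick = tick + xc_tick_limit;

	for (int i = 1; i < nsamples; i++) {
		uint64_t tick_prev = tick;

		tick = samples[i];
		if (tick - tick_prev > xc_tick_jump_limit) {
			/* irregular jump: move the deadline, don't charge the gap */
			endtick = tick + xc_tick_limit;
			printf("tick %4llu: jump, deadline now %llu\n",
			    (unsigned long long)tick,
			    (unsigned long long)endtick);
		} else if (tick > endtick) {
			printf("tick %4llu: deadline %llu exceeded, would panic\n",
			    (unsigned long long)tick,
			    (unsigned long long)endtick);
		} else {
			printf("tick %4llu: still within the window\n",
			    (unsigned long long)tick);
		}
	}
	return (0);
}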
+ */ +int +cpu_aflt_size(void) +{ + /* + * We need to determine whether this is a sabre, Hummingbird or a + * Spitfire/Blackbird impl and set the appropriate state variables for + * ecache tag manipulation. We can't do this in cpu_setup() as it is + * too early in the boot flow and the cpunodes are not initialized. + * This routine will be called once after cpunodes[] is ready, so do + * it here. + */ + if (cpunodes[CPU->cpu_id].implementation == SABRE_IMPL) { + isus2i = 1; + cpu_ec_tag_mask = SB_ECTAG_MASK; + cpu_ec_state_mask = SB_ECSTATE_MASK; + cpu_ec_par_mask = SB_ECPAR_MASK; + cpu_ec_par_shift = SB_ECPAR_SHIFT; + cpu_ec_tag_shift = SB_ECTAG_SHIFT; + cpu_ec_state_shift = SB_ECSTATE_SHIFT; + cpu_ec_state_exl = SB_ECSTATE_EXL; + cpu_ec_state_mod = SB_ECSTATE_MOD; + + /* These states do not exist in sabre - set to 0xFF */ + cpu_ec_state_shr = 0xFF; + cpu_ec_state_own = 0xFF; + + cpu_ec_state_valid = SB_ECSTATE_VALID; + cpu_ec_state_dirty = SB_ECSTATE_DIRTY; + cpu_ec_state_parity = SB_ECSTATE_PARITY; + cpu_ec_parity = SB_EC_PARITY; + } else if (cpunodes[CPU->cpu_id].implementation == HUMMBRD_IMPL) { + isus2e = 1; + cpu_ec_tag_mask = HB_ECTAG_MASK; + cpu_ec_state_mask = HB_ECSTATE_MASK; + cpu_ec_par_mask = HB_ECPAR_MASK; + cpu_ec_par_shift = HB_ECPAR_SHIFT; + cpu_ec_tag_shift = HB_ECTAG_SHIFT; + cpu_ec_state_shift = HB_ECSTATE_SHIFT; + cpu_ec_state_exl = HB_ECSTATE_EXL; + cpu_ec_state_mod = HB_ECSTATE_MOD; + + /* These states do not exist in hummingbird - set to 0xFF */ + cpu_ec_state_shr = 0xFF; + cpu_ec_state_own = 0xFF; + + cpu_ec_state_valid = HB_ECSTATE_VALID; + cpu_ec_state_dirty = HB_ECSTATE_DIRTY; + cpu_ec_state_parity = HB_ECSTATE_PARITY; + cpu_ec_parity = HB_EC_PARITY; + } + + return (sizeof (spitf_async_flt)); +} + + +/* + * Correctable ecc error trap handler + */ +/*ARGSUSED*/ +void +cpu_ce_error(struct regs *rp, ulong_t p_afar, ulong_t p_afsr, + uint_t p_afsr_high, uint_t p_afar_high) +{ + ushort_t sdbh, sdbl; + ushort_t e_syndh, e_syndl; + spitf_async_flt spf_flt; + struct async_flt *ecc; + int queue = 1; + + uint64_t t_afar = p_afar; + uint64_t t_afsr = p_afsr; + + /* + * Note: the Spitfire data buffer error registers + * (upper and lower halves) are or'ed into the upper + * word of the afsr by ce_err(). + */ + sdbh = (ushort_t)((t_afsr >> 33) & 0x3FF); + sdbl = (ushort_t)((t_afsr >> 43) & 0x3FF); + + e_syndh = (uchar_t)(sdbh & (uint_t)P_DER_E_SYND); + e_syndl = (uchar_t)(sdbl & (uint_t)P_DER_E_SYND); + + t_afsr &= S_AFSR_MASK; + t_afar &= SABRE_AFAR_PA; /* must use Sabre AFAR mask */ + + /* Setup the async fault structure */ + bzero(&spf_flt, sizeof (spitf_async_flt)); + ecc = (struct async_flt *)&spf_flt; + ecc->flt_id = gethrtime_waitfree(); + ecc->flt_stat = t_afsr; + ecc->flt_addr = t_afar; + ecc->flt_status = ECC_C_TRAP; + ecc->flt_bus_id = getprocessorid(); + ecc->flt_inst = CPU->cpu_id; + ecc->flt_pc = (caddr_t)rp->r_pc; + ecc->flt_func = log_ce_err; + ecc->flt_in_memory = + (pf_is_memory(ecc->flt_addr >> MMU_PAGESHIFT)) ? 1: 0; + spf_flt.flt_sdbh = sdbh; + spf_flt.flt_sdbl = sdbl; + + /* + * Check for fatal conditions. + */ + check_misc_err(&spf_flt); + + /* + * Pananoid checks for valid AFSR and UDBs + */ + if ((t_afsr & P_AFSR_CE) == 0) { + cpu_aflt_log(CE_PANIC, 1, &spf_flt, CMN_LFLAGS, + "** Panic due to CE bit not set in the AFSR", + " Corrected Memory Error on"); + } + + /* + * We want to skip logging only if ALL the following + * conditions are true: + * + * 1. There is only one error + * 2. That error is a correctable memory error + * 3. 
The error is caused by the memory scrubber (in which case + * the error will have occurred under on_trap protection) + * 4. The error is on a retired page + * + * Note: OT_DATA_EC is used places other than the memory scrubber. + * However, none of those errors should occur on a retired page. + */ + if ((ecc->flt_stat & (S_AFSR_ALL_ERRS & ~P_AFSR_ME)) == P_AFSR_CE && + curthread->t_ontrap != NULL) { + + if (curthread->t_ontrap->ot_prot & OT_DATA_EC) { + page_t *pp = page_numtopp_nolock((pfn_t) + (ecc->flt_addr >> MMU_PAGESHIFT)); + + if (pp != NULL && page_isretired(pp)) { + queue = 0; + } + } + } + + if (((sdbh & P_DER_CE) == 0) && ((sdbl & P_DER_CE) == 0)) { + cpu_aflt_log(CE_PANIC, 1, &spf_flt, CMN_LFLAGS, + "** Panic due to CE bits not set in the UDBs", + " Corrected Memory Error on"); + } + + if ((sdbh >> 8) & 1) { + ecc->flt_synd = e_syndh; + ce_scrub(ecc); + if (queue) { + cpu_errorq_dispatch(FM_EREPORT_CPU_USII_CE, ecc, + sizeof (*ecc), ce_queue, ERRORQ_ASYNC); + } + } + + if ((sdbl >> 8) & 1) { + ecc->flt_addr = t_afar | 0x8; /* Sabres do not have a UDBL */ + ecc->flt_synd = e_syndl | UDBL_REG; + ce_scrub(ecc); + if (queue) { + cpu_errorq_dispatch(FM_EREPORT_CPU_USII_CE, ecc, + sizeof (*ecc), ce_queue, ERRORQ_ASYNC); + } + } + + /* + * Re-enable all error trapping (CEEN currently cleared). + */ + clr_datapath(); + set_asyncflt(P_AFSR_CE); + set_error_enable(EER_ENABLE); +} + +/* + * Cpu specific CE logging routine + */ +static void +log_ce_err(struct async_flt *aflt, char *unum) +{ + spitf_async_flt spf_flt; + + if ((aflt->flt_stat & P_AFSR_CE) && (ce_verbose_memory == 0)) { + return; + } + + spf_flt.cmn_asyncflt = *aflt; + cpu_aflt_log(CE_CONT, 0, &spf_flt, CE_LFLAGS, unum, + " Corrected Memory Error detected by"); +} + +/* + * Spitfire does not perform any further CE classification refinement + */ +/*ARGSUSED*/ +int +ce_scrub_xdiag_recirc(struct async_flt *ecc, errorq_t *eqp, errorq_elem_t *eqep, + size_t afltoffset) +{ + return (0); +} + +char * +flt_to_error_type(struct async_flt *aflt) +{ + if (aflt->flt_status & ECC_INTERMITTENT) + return (ERR_TYPE_DESC_INTERMITTENT); + if (aflt->flt_status & ECC_PERSISTENT) + return (ERR_TYPE_DESC_PERSISTENT); + if (aflt->flt_status & ECC_STICKY) + return (ERR_TYPE_DESC_STICKY); + return (ERR_TYPE_DESC_UNKNOWN); +} + +/* + * Called by correctable ecc error logging code to print out + * the stick/persistent/intermittent status of the error. + */ +static void +cpu_ce_log_status(spitf_async_flt *spf_flt, char *unum) +{ + ushort_t status; + char *status1_str = "Memory"; + char *status2_str = "Intermittent"; + struct async_flt *aflt = (struct async_flt *)spf_flt; + + status = aflt->flt_status; + + if (status & ECC_ECACHE) + status1_str = "Ecache"; + + if (status & ECC_STICKY) + status2_str = "Sticky"; + else if (status & ECC_PERSISTENT) + status2_str = "Persistent"; + + cpu_aflt_log(CE_CONT, 0, spf_flt, CPU_ERRID_FIRST, + NULL, " Corrected %s Error on %s is %s", + status1_str, unum, status2_str); +} + +/* + * check for a valid ce syndrome, then call the + * displacement flush scrubbing code, and then check the afsr to see if + * the error was persistent or intermittent. Reread the afar/afsr to see + * if the error was not scrubbed successfully, and is therefore sticky. + */ +/*ARGSUSED1*/ +void +cpu_ce_scrub_mem_err(struct async_flt *ecc, boolean_t triedcpulogout) +{ + uint64_t eer, afsr; + ushort_t status; + + ASSERT(getpil() > LOCK_LEVEL); + + /* + * It is possible that the flt_addr is not a valid + * physical address. 
To deal with this, we disable + * NCEEN while we scrub that address. If this causes + * a TIMEOUT/BERR, we know this is an invalid + * memory location. + */ + kpreempt_disable(); + eer = get_error_enable(); + if (eer & (EER_CEEN | EER_NCEEN)) + set_error_enable(eer & ~(EER_CEEN | EER_NCEEN)); + + /* + * To check if the error detected by IO is persistent, sticky or + * intermittent. + */ + if (ecc->flt_status & ECC_IOBUS) { + ecc->flt_stat = P_AFSR_CE; + } + + scrubphys(P2ALIGN(ecc->flt_addr, 64), + cpunodes[CPU->cpu_id].ecache_size); + + get_asyncflt(&afsr); + if (afsr & (P_AFSR_TO | P_AFSR_BERR)) { + /* + * Must ensure that we don't get the TIMEOUT/BERR + * when we reenable NCEEN, so we clear the AFSR. + */ + set_asyncflt(afsr & (P_AFSR_TO | P_AFSR_BERR)); + if (eer & (EER_CEEN | EER_NCEEN)) + set_error_enable(eer); + kpreempt_enable(); + return; + } + + if (eer & EER_NCEEN) + set_error_enable(eer & ~EER_CEEN); + + /* + * Check and clear any ECC errors from the scrub. If the scrub did + * not trip over the error, mark it intermittent. If the scrub did + * trip the error again and it did not scrub away, mark it sticky. + * Otherwise mark it persistent. + */ + if (check_ecc(ecc) != 0) { + cpu_read_paddr(ecc, 0, 1); + + if (check_ecc(ecc) != 0) + status = ECC_STICKY; + else + status = ECC_PERSISTENT; + } else + status = ECC_INTERMITTENT; + + if (eer & (EER_CEEN | EER_NCEEN)) + set_error_enable(eer); + kpreempt_enable(); + + ecc->flt_status &= ~(ECC_INTERMITTENT | ECC_PERSISTENT | ECC_STICKY); + ecc->flt_status |= status; +} + +/* + * get the syndrome and unum, and then call the routines + * to check the other cpus and iobuses, and then do the error logging. + */ +/*ARGSUSED1*/ +void +cpu_ce_log_err(struct async_flt *ecc, errorq_elem_t *eqep) +{ + char unum[UNUM_NAMLEN]; + int len = 0; + int ce_verbose = 0; + + ASSERT(ecc->flt_func != NULL); + + /* Get the unum string for logging purposes */ + (void) cpu_get_mem_unum_aflt(AFLT_STAT_VALID, ecc, unum, + UNUM_NAMLEN, &len); + + /* Call specific error logging routine */ + (void) (*ecc->flt_func)(ecc, unum); + + /* + * Count errors per unum. + * Non-memory errors are all counted via a special unum string. + */ + if (ce_count_unum(ecc->flt_status, len, unum) == PAGE_IS_FAILING && + automatic_page_removal) { + page_t *pp = page_numtopp_nolock((pfn_t) + (ecc->flt_addr >> MMU_PAGESHIFT)); + + if (pp) { + page_settoxic(pp, PAGE_IS_FAULTY); + (void) page_retire(pp, PAGE_IS_FAILING); + } + } + + if (ecc->flt_panic) { + ce_verbose = 1; + } else if ((ecc->flt_class == BUS_FAULT) || + (ecc->flt_stat & P_AFSR_CE)) { + ce_verbose = (ce_verbose_memory > 0); + } else { + ce_verbose = 1; + } + + if (ce_verbose) { + spitf_async_flt sflt; + int synd_code; + + sflt.cmn_asyncflt = *ecc; /* for cpu_aflt_log() */ + + cpu_ce_log_status(&sflt, unum); + + synd_code = synd_to_synd_code(AFLT_STAT_VALID, + SYND(ecc->flt_synd)); + + if (SYND_IS_SINGLE_BIT_DATA(synd_code)) { + cpu_aflt_log(CE_CONT, 0, &sflt, CPU_ERRID_FIRST, + NULL, " ECC Data Bit %2d was in error " + "and corrected", synd_code); + } else if (SYND_IS_SINGLE_BIT_CHK(synd_code)) { + cpu_aflt_log(CE_CONT, 0, &sflt, CPU_ERRID_FIRST, + NULL, " ECC Check Bit %2d was in error " + "and corrected", synd_code - C0); + } else { + /* + * These are UE errors - we shouldn't be getting CE + * traps for these; handle them in case of bad h/w. 
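The scrub logic in cpu_ce_scrub_mem_err() above reduces to a three-way classification: if the error does not recur after the displacement scrub it is intermittent; if it recurs but does not show up again after the subsequent re-read it is persistent; if it is still there after the re-read it is sticky. A minimal restatement follows, with the two check_ecc() outcomes simulated since the hardware checks cannot run here.

#include <stdio.h>

/*
 * err_after_scrub / err_after_reread stand in for the two check_ecc()
 * calls made after scrubphys() and cpu_read_paddr() respectively.
 */
static const char *
classify_ce(int err_after_scrub, int err_after_reread)
{
	if (!err_after_scrub)
		return ("intermittent");
	return (err_after_reread ? "sticky" : "persistent");
}

int
main(void)
{
	printf("clean after scrub            -> %s\n", classify_ce(0, 0));
	printf("recurs, clean after re-read  -> %s\n", classify_ce(1, 0));
	printf("recurs, survives the re-read -> %s\n", classify_ce(1, 1));
	return (0);
}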
+ */ + switch (synd_code) { + case M2: + cpu_aflt_log(CE_CONT, 0, &sflt, + CPU_ERRID_FIRST, NULL, + " Two ECC Bits were in error"); + break; + case M3: + cpu_aflt_log(CE_CONT, 0, &sflt, + CPU_ERRID_FIRST, NULL, + " Three ECC Bits were in error"); + break; + case M4: + cpu_aflt_log(CE_CONT, 0, &sflt, + CPU_ERRID_FIRST, NULL, + " Four ECC Bits were in error"); + break; + case MX: + cpu_aflt_log(CE_CONT, 0, &sflt, + CPU_ERRID_FIRST, NULL, + " More than Four ECC bits were " + "in error"); + break; + default: + cpu_aflt_log(CE_CONT, 0, &sflt, + CPU_ERRID_FIRST, NULL, + " Unknown fault syndrome %d", + synd_code); + break; + } + } + } + + /* Display entire cache line, if valid address */ + if (ce_show_data && ecc->flt_addr != AFLT_INV_ADDR) + read_ecc_data(ecc, 1, 1); +} + +/* + * We route all errors through a single switch statement. + */ +void +cpu_ue_log_err(struct async_flt *aflt) +{ + + switch (aflt->flt_class) { + case CPU_FAULT: + cpu_async_log_err(aflt); + break; + + case BUS_FAULT: + bus_async_log_err(aflt); + break; + + default: + cmn_err(CE_WARN, "discarding async error 0x%p with invalid " + "fault class (0x%x)", (void *)aflt, aflt->flt_class); + break; + } +} + +/* Values for action variable in cpu_async_error() */ +#define ACTION_NONE 0 +#define ACTION_TRAMPOLINE 1 +#define ACTION_AST_FLAGS 2 + +/* + * Access error trap handler for asynchronous cpu errors. This routine is + * called to handle a data or instruction access error. All fatal errors are + * completely handled by this routine (by panicking). Non fatal error logging + * is queued for later processing either via AST or softint at a lower PIL. + * In case of panic, the error log queue will also be processed as part of the + * panic flow to ensure all errors are logged. This routine is called with all + * errors disabled at PIL15. The AFSR bits are cleared and the UDBL and UDBH + * error bits are also cleared. The hardware has also disabled the I and + * D-caches for us, so we must re-enable them before returning. + * + * A summary of the handling of tl=0 UE/LDP/EDP/TO/BERR/WP/CP: + * + * _______________________________________________________________ + * | Privileged tl0 | Unprivileged | + * | Protected | Unprotected | Protected | Unprotected | + * |on_trap|lofault| | | | + * -------------|-------|-------+---------------+---------------+-------------| + * | | | | | | + * UE/LDP/EDP | L,T,p | L,R,p | L,P | n/a | L,R,p | + * | | | | | | + * TO/BERR | T | S | L,P | n/a | S | + * | | | | | | + * WP | L,M,p | L,M,p | L,M,p | n/a | L,M,p | + * | | | | | | + * CP (IIi/IIe) | L,P | L,P | L,P | n/a | L,P | + * ____________________________________________________________________________ + * + * + * Action codes: + * + * L - log + * M - kick off memscrubber if flt_in_memory + * P - panic + * p - panic if US-IIi or US-IIe (Sabre); overrides R and M + * R - i) if aft_panic is set, panic + * ii) otherwise, send hwerr event to contract and SIGKILL to process + * S - send SIGBUS to process + * T - trampoline + * + * Special cases: + * + * 1) if aft_testfatal is set, all faults result in a panic regardless + * of type (even WP), protection (even on_trap), or privilege. 
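As a compact restatement of the common cases in the table above, the sketch below mirrors the flt_panic decision cpu_async_error() makes for unprotected and copy-protected faults; the TL>0, aft_testfatal and Sabre-specific WP/CP cases are handled separately in the real routine, and the function and parameter names here are invented for illustration.

#include <stdio.h>

/*
 * ue_ldp_edp / to_berr: which AFSR error classes are present.
 * prot_none / prot_copy: the AFLT_PROT_* state derived from on_trap()
 * and t_lofault.  Returns nonzero when the handler would set flt_panic.
 */
static int
would_panic(int ue_ldp_edp, int to_berr, int flt_priv,
    int prot_none, int prot_copy, int aft_panic)
{
	if (prot_none) {
		if (ue_ldp_edp && (flt_priv || aft_panic))
			return (1);
		if (to_berr && flt_priv)
			return (1);
	} else if (prot_copy && aft_panic) {
		return (1);
	}
	return (0);
}

int
main(void)
{
	printf("privileged UE, unprotected:   %d\n",
	    would_panic(1, 0, 1, 1, 0, 0));
	printf("user UE, aft_panic clear:     %d\n",
	    would_panic(1, 0, 0, 1, 0, 0));
	printf("user BERR (gets SIGBUS only): %d\n",
	    would_panic(0, 1, 0, 1, 0, 0));
	printf("copy-protected UE, aft_panic: %d\n",
	    would_panic(1, 0, 1, 0, 1, 1));
	return (0);
}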
+ */ +/*ARGSUSED*/ +void +cpu_async_error(struct regs *rp, ulong_t p_afar, ulong_t p_afsr, + uint_t p_afsr_high, uint_t p_afar_high) +{ + ushort_t sdbh, sdbl, ttype, tl; + spitf_async_flt spf_flt; + struct async_flt *aflt; + char pr_reason[28]; + uint64_t oafsr; + uint64_t acc_afsr = 0; /* accumulated afsr */ + int action = ACTION_NONE; + uint64_t t_afar = p_afar; + uint64_t t_afsr = p_afsr; + int expected = DDI_FM_ERR_UNEXPECTED; + ddi_acc_hdl_t *hp; + + /* + * We need to look at p_flag to determine if the thread detected an + * error while dumping core. We can't grab p_lock here, but it's ok + * because we just need a consistent snapshot and we know that everyone + * else will store a consistent set of bits while holding p_lock. We + * don't have to worry about a race because SDOCORE is set once prior + * to doing i/o from the process's address space and is never cleared. + */ + uint_t pflag = ttoproc(curthread)->p_flag; + + pr_reason[0] = '\0'; + + /* + * Note: the Spitfire data buffer error registers + * (upper and lower halves) are or'ed into the upper + * word of the afsr by async_err() if P_AFSR_UE is set. + */ + sdbh = (ushort_t)((t_afsr >> 33) & 0x3FF); + sdbl = (ushort_t)((t_afsr >> 43) & 0x3FF); + + /* + * Grab the ttype encoded in <63:53> of the saved + * afsr passed from async_err() + */ + ttype = (ushort_t)((t_afsr >> 53) & 0x1FF); + tl = (ushort_t)(t_afsr >> 62); + + t_afsr &= S_AFSR_MASK; + t_afar &= SABRE_AFAR_PA; /* must use Sabre AFAR mask */ + + /* + * Initialize most of the common and CPU-specific structure. We derive + * aflt->flt_priv from %tstate, instead of from the AFSR.PRIV bit. The + * initial setting of aflt->flt_panic is based on TL: we must panic if + * the error occurred at TL > 0. We also set flt_panic if the test/demo + * tuneable aft_testfatal is set (not the default). + */ + bzero(&spf_flt, sizeof (spitf_async_flt)); + aflt = (struct async_flt *)&spf_flt; + aflt->flt_id = gethrtime_waitfree(); + aflt->flt_stat = t_afsr; + aflt->flt_addr = t_afar; + aflt->flt_bus_id = getprocessorid(); + aflt->flt_inst = CPU->cpu_id; + aflt->flt_pc = (caddr_t)rp->r_pc; + aflt->flt_prot = AFLT_PROT_NONE; + aflt->flt_class = CPU_FAULT; + aflt->flt_priv = (rp->r_tstate & TSTATE_PRIV) ? 1 : 0; + aflt->flt_tl = (uchar_t)tl; + aflt->flt_panic = (tl != 0 || aft_testfatal != 0); + aflt->flt_core = (pflag & SDOCORE) ? 1 : 0; + + /* + * Set flt_status based on the trap type. If we end up here as the + * result of a UE detected by the CE handling code, leave status 0. + */ + switch (ttype) { + case T_DATA_ERROR: + aflt->flt_status = ECC_D_TRAP; + break; + case T_INSTR_ERROR: + aflt->flt_status = ECC_I_TRAP; + break; + } + + spf_flt.flt_sdbh = sdbh; + spf_flt.flt_sdbl = sdbl; + + /* + * Check for fatal async errors. + */ + check_misc_err(&spf_flt); + + /* + * If the trap occurred in privileged mode at TL=0, we need to check to + * see if we were executing in the kernel under on_trap() or t_lofault + * protection. If so, modify the saved registers so that we return + * from the trap to the appropriate trampoline routine. 
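The shifts used above to recover sdbh, sdbl, ttype and tl depend on the low-level trap glue (ce_err()/async_err()) having folded the UDB contents, the trap type and the trap level into the otherwise-unused upper bits of the saved AFSR, as the comments describe. The sketch below packs invented field values at those same bit positions and unpacks them the way the handlers do.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* invented field values, within the field widths used above */
	uint64_t afsr_low = 0x0000000000100000ULL;	/* pretend AFSR bits */
	uint64_t udbh = 0x1a3, udbl = 0x2b5;		/* 10-bit UDB contents */
	uint64_t ttype = 0x32, tl = 1;			/* trap type, trap level */

	/* pack the fields the way the trap glue is described to do */
	uint64_t saved = afsr_low | (udbh << 33) | (udbl << 43) |
	    (ttype << 53) | (tl << 62);

	/* unpack with the same shifts as cpu_async_error()/cpu_ce_error() */
	printf("sdbh  = 0x%llx\n", (unsigned long long)((saved >> 33) & 0x3FF));
	printf("sdbl  = 0x%llx\n", (unsigned long long)((saved >> 43) & 0x3FF));
	printf("ttype = 0x%llx\n", (unsigned long long)((saved >> 53) & 0x1FF));
	printf("tl    = %llu\n", (unsigned long long)(saved >> 62));
	return (0);
}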
+ */ + if (aflt->flt_priv && tl == 0) { + if (curthread->t_ontrap != NULL) { + on_trap_data_t *otp = curthread->t_ontrap; + + if (otp->ot_prot & OT_DATA_EC) { + aflt->flt_prot = AFLT_PROT_EC; + otp->ot_trap |= OT_DATA_EC; + rp->r_pc = otp->ot_trampoline; + rp->r_npc = rp->r_pc + 4; + action = ACTION_TRAMPOLINE; + } + + if ((t_afsr & (P_AFSR_TO | P_AFSR_BERR)) && + (otp->ot_prot & OT_DATA_ACCESS)) { + aflt->flt_prot = AFLT_PROT_ACCESS; + otp->ot_trap |= OT_DATA_ACCESS; + rp->r_pc = otp->ot_trampoline; + rp->r_npc = rp->r_pc + 4; + action = ACTION_TRAMPOLINE; + /* + * for peeks and caut_gets errors are expected + */ + hp = (ddi_acc_hdl_t *)otp->ot_handle; + if (!hp) + expected = DDI_FM_ERR_PEEK; + else if (hp->ah_acc.devacc_attr_access == + DDI_CAUTIOUS_ACC) + expected = DDI_FM_ERR_EXPECTED; + } + + } else if (curthread->t_lofault) { + aflt->flt_prot = AFLT_PROT_COPY; + rp->r_g1 = EFAULT; + rp->r_pc = curthread->t_lofault; + rp->r_npc = rp->r_pc + 4; + action = ACTION_TRAMPOLINE; + } + } + + /* + * Determine if this error needs to be treated as fatal. Note that + * multiple errors detected upon entry to this trap handler does not + * necessarily warrant a panic. We only want to panic if the trap + * happened in privileged mode and not under t_ontrap or t_lofault + * protection. The exception is WP: if we *only* get WP, it is not + * fatal even if the trap occurred in privileged mode, except on Sabre. + * + * aft_panic, if set, effectively makes us treat usermode + * UE/EDP/LDP faults as if they were privileged - so we we will + * panic instead of sending a contract event. A lofault-protected + * fault will normally follow the contract event; if aft_panic is + * set this will be changed to a panic. + * + * For usermode BERR/BTO errors, eg from processes performing device + * control through mapped device memory, we need only deliver + * a SIGBUS to the offending process. + * + * Some additional flt_panic reasons (eg, WP on Sabre) will be + * checked later; for now we implement the common reasons. + */ + if (aflt->flt_prot == AFLT_PROT_NONE) { + /* + * Beware - multiple bits may be set in AFSR + */ + if (t_afsr & (P_AFSR_UE | P_AFSR_LDP | P_AFSR_EDP)) { + if (aflt->flt_priv || aft_panic) + aflt->flt_panic = 1; + } + + if (t_afsr & (P_AFSR_TO | P_AFSR_BERR)) { + if (aflt->flt_priv) + aflt->flt_panic = 1; + } + } else if (aflt->flt_prot == AFLT_PROT_COPY && aft_panic) { + aflt->flt_panic = 1; + } + + /* + * UE/BERR/TO: Call our bus nexus friends to check for + * IO errors that may have resulted in this trap. + */ + if (t_afsr & (P_AFSR_TO | P_AFSR_BERR | P_AFSR_UE)) { + cpu_run_bus_error_handlers(aflt, expected); + } + + /* + * Handle UE: If the UE is in memory, we need to flush the bad line from + * the E-cache. We also need to query the bus nexus for fatal errors. + * For sabre, we will panic on UEs. Attempts to do diagnostic read on + * caches may introduce more parity errors (especially when the module + * is bad) and in sabre there is no guarantee that such errors + * (if introduced) are written back as poisoned data. + */ + if (t_afsr & P_AFSR_UE) { + int i; + + (void) strcat(pr_reason, "UE "); + + spf_flt.flt_type = CPU_UE_ERR; + aflt->flt_in_memory = (pf_is_memory(aflt->flt_addr >> + MMU_PAGESHIFT)) ? 1: 0; + + /* + * With UE, we have the PA of the fault. + * Let do a diagnostic read to get the ecache + * data and tag info of the bad line for logging. 
+ */ + if (aflt->flt_in_memory) { + uint32_t ec_set_size; + uchar_t state; + uint32_t ecache_idx; + uint64_t faultpa = P2ALIGN(aflt->flt_addr, 64); + + /* touch the line to put it in ecache */ + acc_afsr |= read_and_clear_afsr(); + (void) lddphys(faultpa); + acc_afsr |= (read_and_clear_afsr() & + ~(P_AFSR_EDP | P_AFSR_UE)); + + ec_set_size = cpunodes[CPU->cpu_id].ecache_size / + ecache_associativity; + + for (i = 0; i < ecache_associativity; i++) { + ecache_idx = i * ec_set_size + + (aflt->flt_addr % ec_set_size); + get_ecache_dtag(P2ALIGN(ecache_idx, 64), + (uint64_t *)&spf_flt.flt_ec_data[0], + &spf_flt.flt_ec_tag, &oafsr, &acc_afsr); + acc_afsr |= oafsr; + + state = (uchar_t)((spf_flt.flt_ec_tag & + cpu_ec_state_mask) >> cpu_ec_state_shift); + + if ((state & cpu_ec_state_valid) && + ((spf_flt.flt_ec_tag & cpu_ec_tag_mask) == + ((uint64_t)aflt->flt_addr >> + cpu_ec_tag_shift))) + break; + } + + /* + * Check to see if the ecache tag is valid for the + * fault PA. In the very unlikely event where the + * line could be victimized, no ecache info will be + * available. If this is the case, capture the line + * from memory instead. + */ + if ((state & cpu_ec_state_valid) == 0 || + (spf_flt.flt_ec_tag & cpu_ec_tag_mask) != + ((uint64_t)aflt->flt_addr >> cpu_ec_tag_shift)) { + for (i = 0; i < 8; i++, faultpa += 8) { + ec_data_t *ecdptr; + + ecdptr = &spf_flt.flt_ec_data[i]; + acc_afsr |= read_and_clear_afsr(); + ecdptr->ec_d8 = lddphys(faultpa); + acc_afsr |= (read_and_clear_afsr() & + ~(P_AFSR_EDP | P_AFSR_UE)); + ecdptr->ec_afsr = 0; + /* null afsr value */ + } + + /* + * Mark tag invalid to indicate mem dump + * when we print out the info. + */ + spf_flt.flt_ec_tag = AFLT_INV_ADDR; + } + spf_flt.flt_ec_lcnt = 1; + + /* + * Flush out the bad line + */ + flushecacheline(P2ALIGN(aflt->flt_addr, 64), + cpunodes[CPU->cpu_id].ecache_size); + + acc_afsr |= clear_errors(NULL, NULL); + } + + /* + * Ask our bus nexus friends if they have any fatal errors. If + * so, they will log appropriate error messages and panic as a + * result. We then queue an event for each UDB that reports a + * UE. Each UE reported in a UDB will have its own log message. + * + * Note from kbn: In the case where there are multiple UEs + * (ME bit is set) - the AFAR address is only accurate to + * the 16-byte granularity. One cannot tell whether the AFAR + * belongs to the UDBH or UDBL syndromes. In this case, we + * always report the AFAR address to be 16-byte aligned. + * + * If we're on a Sabre, there is no SDBL, but it will always + * read as zero, so the sdbl test below will safely fail. + */ + if (bus_func_invoke(BF_TYPE_UE) == BF_FATAL || isus2i || isus2e) + aflt->flt_panic = 1; + + if (sdbh & P_DER_UE) { + aflt->flt_synd = sdbh & P_DER_E_SYND; + cpu_errorq_dispatch(FM_EREPORT_CPU_USII_UE, + (void *)&spf_flt, sizeof (spf_flt), ue_queue, + aflt->flt_panic); + } + if (sdbl & P_DER_UE) { + aflt->flt_synd = sdbl & P_DER_E_SYND; + aflt->flt_synd |= UDBL_REG; /* indicates UDBL */ + if (!(aflt->flt_stat & P_AFSR_ME)) + aflt->flt_addr |= 0x8; + cpu_errorq_dispatch(FM_EREPORT_CPU_USII_UE, + (void *)&spf_flt, sizeof (spf_flt), ue_queue, + aflt->flt_panic); + } + + /* + * We got a UE and are panicking, save the fault PA in a known + * location so that the platform specific panic code can check + * for copyback errors. + */ + if (aflt->flt_panic && aflt->flt_in_memory) { + panic_aflt = *aflt; + } + } + + /* + * Handle EDP and LDP: Locate the line with bad parity and enqueue an + * async error for logging. 
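The diagnostic lookup above computes, for each way of the E-cache, the index to read (the fault PA modulo the set size, offset by the way number) and compares the stored tag against the fault PA shifted right by cpu_ec_tag_shift. The sketch below reproduces just that arithmetic; the geometry, tag shift and fault address are invented, since the real values come from the PROM properties and the SB_*/HB_* constants.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* invented geometry and shift, not the real Sabre/Hummingbird values */
	uint32_t ecache_size = 1 * 1024 * 1024;		/* 1 MB */
	int ecache_associativity = 2;
	int cpu_ec_tag_shift = 19;

	uint64_t flt_addr = 0x000000007654a040ULL;	/* made-up fault PA */
	uint32_t ec_set_size = ecache_size / ecache_associativity;

	printf("tag to match: 0x%llx\n",
	    (unsigned long long)(flt_addr >> cpu_ec_tag_shift));

	for (int way = 0; way < ecache_associativity; way++) {
		uint32_t ecache_idx = way * ec_set_size +
		    (uint32_t)(flt_addr % ec_set_size);

		/* the handler reads the 64-byte aligned line at this index */
		printf("way %d: index 0x%05x, line 0x%05x\n",
		    way, ecache_idx, ecache_idx & ~0x3fU);
	}
	return (0);
}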
For Sabre, we panic on EDP or LDP. + */ + if (t_afsr & (P_AFSR_EDP | P_AFSR_LDP)) { + spf_flt.flt_type = CPU_EDP_LDP_ERR; + + if (t_afsr & P_AFSR_EDP) + (void) strcat(pr_reason, "EDP "); + + if (t_afsr & P_AFSR_LDP) + (void) strcat(pr_reason, "LDP "); + + /* + * Here we have no PA to work with. + * Scan each line in the ecache to look for + * the one with bad parity. + */ + aflt->flt_addr = AFLT_INV_ADDR; + scan_ecache(&aflt->flt_addr, &spf_flt.flt_ec_data[0], + &spf_flt.flt_ec_tag, &spf_flt.flt_ec_lcnt, &oafsr); + acc_afsr |= (oafsr & ~P_AFSR_WP); + + /* + * If we found a bad PA, update the state to indicate if it is + * memory or I/O space. This code will be important if we ever + * support cacheable frame buffers. + */ + if (aflt->flt_addr != AFLT_INV_ADDR) { + aflt->flt_in_memory = (pf_is_memory(aflt->flt_addr >> + MMU_PAGESHIFT)) ? 1 : 0; + } + + if (isus2i || isus2e) + aflt->flt_panic = 1; + + cpu_errorq_dispatch((t_afsr & P_AFSR_EDP) ? + FM_EREPORT_CPU_USII_EDP : FM_EREPORT_CPU_USII_LDP, + (void *)&spf_flt, sizeof (spf_flt), ue_queue, + aflt->flt_panic); + } + + /* + * Timeout and bus error handling. There are two cases to consider: + * + * (1) If we are in the kernel protected by ddi_peek or ddi_poke,we + * have already modified the saved registers so that we will return + * from the trap to the appropriate trampoline routine; otherwise panic. + * + * (2) In user mode, we can simply use our AST mechanism to deliver + * a SIGBUS. We do not log the occurence - processes performing + * device control would generate lots of uninteresting messages. + */ + if (t_afsr & (P_AFSR_TO | P_AFSR_BERR)) { + if (t_afsr & P_AFSR_TO) + (void) strcat(pr_reason, "BTO "); + + if (t_afsr & P_AFSR_BERR) + (void) strcat(pr_reason, "BERR "); + + spf_flt.flt_type = CPU_BTO_BERR_ERR; + if (aflt->flt_priv && aflt->flt_prot == AFLT_PROT_NONE) { + cpu_errorq_dispatch((t_afsr & P_AFSR_TO) ? + FM_EREPORT_CPU_USII_TO : FM_EREPORT_CPU_USII_BERR, + (void *)&spf_flt, sizeof (spf_flt), ue_queue, + aflt->flt_panic); + } + } + + /* + * Handle WP: WP happens when the ecache is victimized and a parity + * error was detected on a writeback. The data in question will be + * poisoned as a UE will be written back. The PA is not logged and + * it is possible that it doesn't belong to the trapped thread. The + * WP trap is not fatal, but it could be fatal to someone that + * subsequently accesses the toxic page. We set read_all_memscrub + * to force the memscrubber to read all of memory when it awakens. + * For Sabre/Hummingbird, WP is fatal because the HW doesn't write a + * UE back to poison the data. + */ + if (t_afsr & P_AFSR_WP) { + (void) strcat(pr_reason, "WP "); + if (isus2i || isus2e) { + aflt->flt_panic = 1; + } else { + read_all_memscrub = 1; + } + spf_flt.flt_type = CPU_WP_ERR; + cpu_errorq_dispatch(FM_EREPORT_CPU_USII_WP, + (void *)&spf_flt, sizeof (spf_flt), ue_queue, + aflt->flt_panic); + } + + /* + * Handle trapping CP error: In Sabre/Hummingbird, parity error in + * the ecache on a copyout due to a PCI DMA read is signaled as a CP. + * This is fatal. + */ + + if (t_afsr & P_AFSR_CP) { + if (isus2i || isus2e) { + (void) strcat(pr_reason, "CP "); + aflt->flt_panic = 1; + spf_flt.flt_type = CPU_TRAPPING_CP_ERR; + cpu_errorq_dispatch(FM_EREPORT_CPU_USII_CP, + (void *)&spf_flt, sizeof (spf_flt), ue_queue, + aflt->flt_panic); + } else { + /* + * Orphan CP: Happens due to signal integrity problem + * on a CPU, where a CP is reported, without reporting + * its associated UE. 
This is handled by locating the + * bad parity line and would kick off the memscrubber + * to find the UE if in memory or in another's cache. + */ + spf_flt.flt_type = CPU_ORPHAN_CP_ERR; + (void) strcat(pr_reason, "ORPHAN_CP "); + + /* + * Here we have no PA to work with. + * Scan each line in the ecache to look for + * the one with bad parity. + */ + aflt->flt_addr = AFLT_INV_ADDR; + scan_ecache(&aflt->flt_addr, &spf_flt.flt_ec_data[0], + &spf_flt.flt_ec_tag, &spf_flt.flt_ec_lcnt, + &oafsr); + acc_afsr |= oafsr; + + /* + * If we found a bad PA, update the state to indicate + * if it is memory or I/O space. + */ + if (aflt->flt_addr != AFLT_INV_ADDR) { + aflt->flt_in_memory = + (pf_is_memory(aflt->flt_addr >> + MMU_PAGESHIFT)) ? 1 : 0; + } + read_all_memscrub = 1; + cpu_errorq_dispatch(FM_EREPORT_CPU_USII_CP, + (void *)&spf_flt, sizeof (spf_flt), ue_queue, + aflt->flt_panic); + + } + } + + /* + * If we queued an error other than WP or CP and we are going to return + * from the trap and the error was in user mode or inside of a + * copy routine, set AST flag so the queue will be drained before + * returning to user mode. + * + * For UE/LDP/EDP, the AST processing will SIGKILL the process + * and send an event to its process contract. + * + * For BERR/BTO, the AST processing will SIGBUS the process. There + * will have been no error queued in this case. + */ + if ((t_afsr & + (P_AFSR_UE | P_AFSR_LDP | P_AFSR_EDP | P_AFSR_BERR | P_AFSR_TO)) && + (!aflt->flt_priv || aflt->flt_prot == AFLT_PROT_COPY)) { + int pcb_flag = 0; + + if (t_afsr & (P_AFSR_UE | P_AFSR_LDP | P_AFSR_EDP)) + pcb_flag |= ASYNC_HWERR; + + if (t_afsr & P_AFSR_BERR) + pcb_flag |= ASYNC_BERR; + + if (t_afsr & P_AFSR_TO) + pcb_flag |= ASYNC_BTO; + + ttolwp(curthread)->lwp_pcb.pcb_flags |= pcb_flag; + aston(curthread); + action = ACTION_AST_FLAGS; + } + + /* + * In response to a deferred error, we must do one of three things: + * (1) set the AST flags, (2) trampoline, or (3) panic. action is + * set in cases (1) and (2) - check that either action is set or + * (3) is true. + * + * On II, the WP writes poisoned data back to memory, which will + * cause a UE and a panic or reboot when read. In this case, we + * don't need to panic at this time. On IIi and IIe, + * aflt->flt_panic is already set above. + */ + ASSERT((aflt->flt_panic != 0) || (action != ACTION_NONE) || + (t_afsr & P_AFSR_WP)); + + /* + * Make a final sanity check to make sure we did not get any more async + * errors and accumulate the afsr. + */ + flush_ecache(ecache_flushaddr, cpunodes[CPU->cpu_id].ecache_size * 2, + cpunodes[CPU->cpu_id].ecache_linesize); + (void) clear_errors(&spf_flt, NULL); + + /* + * Take care of a special case: If there is a UE in the ecache flush + * area, we'll see it in flush_ecache(). This will trigger the + * CPU_ADDITIONAL_ERRORS case below. + * + * This could occur if the original error was a UE in the flush area, + * or if the original error was an E$ error that was flushed out of + * the E$ in scan_ecache(). + * + * If it's at the same address that we're already logging, then it's + * probably one of these cases. Clear the bit so we don't trip over + * it on the additional errors case, which could cause an unnecessary + * panic. + */ + if ((aflt->flt_stat & P_AFSR_UE) && aflt->flt_addr == t_afar) + acc_afsr |= aflt->flt_stat & ~P_AFSR_UE; + else + acc_afsr |= aflt->flt_stat; + + /* + * Check the acumulated afsr for the important bits. + * Make sure the spf_flt.flt_type value is set, and + * enque an error. 
+ */ + if (acc_afsr & + (P_AFSR_LEVEL1 | P_AFSR_IVUE | P_AFSR_ETP | P_AFSR_ISAP)) { + if (acc_afsr & (P_AFSR_UE | P_AFSR_EDP | P_AFSR_LDP | + P_AFSR_BERR | P_AFSR_TO | P_AFSR_IVUE | P_AFSR_ETP | + P_AFSR_ISAP)) + aflt->flt_panic = 1; + + spf_flt.flt_type = CPU_ADDITIONAL_ERR; + aflt->flt_stat = acc_afsr; + cpu_errorq_dispatch(FM_EREPORT_CPU_USII_UNKNOWN, + (void *)&spf_flt, sizeof (spf_flt), ue_queue, + aflt->flt_panic); + } + + /* + * If aflt->flt_panic is set at this point, we need to panic as the + * result of a trap at TL > 0, or an error we determined to be fatal. + * We've already enqueued the error in one of the if-clauses above, + * and it will be dequeued and logged as part of the panic flow. + */ + if (aflt->flt_panic) { + cpu_aflt_log(CE_PANIC, 1, &spf_flt, CPU_ERRID_FIRST, + "See previous message(s) for details", " %sError(s)", + pr_reason); + } + + /* + * Before returning, we must re-enable errors, and + * reset the caches to their boot-up state. + */ + set_lsu(get_lsu() | cache_boot_state); + set_error_enable(EER_ENABLE); +} + +/* + * Check for miscellaneous fatal errors and call CE_PANIC if any are seen. + * This routine is shared by the CE and UE handling code. + */ +static void +check_misc_err(spitf_async_flt *spf_flt) +{ + struct async_flt *aflt = (struct async_flt *)spf_flt; + char *fatal_str = NULL; + + /* + * The ISAP and ETP errors are supposed to cause a POR + * from the system, so in theory we never, ever see these messages. + * ISAP, ETP and IVUE are considered to be fatal. + */ + if (aflt->flt_stat & P_AFSR_ISAP) + fatal_str = " System Address Parity Error on"; + else if (aflt->flt_stat & P_AFSR_ETP) + fatal_str = " Ecache Tag Parity Error on"; + else if (aflt->flt_stat & P_AFSR_IVUE) + fatal_str = " Interrupt Vector Uncorrectable Error on"; + if (fatal_str != NULL) { + cpu_aflt_log(CE_PANIC, 1, spf_flt, CMN_LFLAGS, + NULL, fatal_str); + } +} + +/* + * Routine to convert a syndrome into a syndrome code. + */ +static int +synd_to_synd_code(int synd_status, ushort_t synd) +{ + if (synd_status != AFLT_STAT_VALID) + return (-1); + + /* + * Use the 8-bit syndrome to index the ecc_syndrome_tab + * to get the code indicating which bit(s) is(are) bad. + */ + if ((synd == 0) || (synd >= SYND_TBL_SIZE)) + return (-1); + else + return (ecc_syndrome_tab[synd]); +} + +/* + * Routine to return a string identifying the physical name + * associated with a memory/cache error. + */ +/* ARGSUSED */ +int +cpu_get_mem_unum(int synd_status, ushort_t synd, uint64_t afsr, + uint64_t afar, int cpuid, int flt_in_memory, ushort_t flt_status, + char *buf, int buflen, int *lenp) +{ + short synd_code; + int ret; + + if (flt_in_memory) { + synd_code = synd_to_synd_code(synd_status, synd); + if (synd_code == -1) { + ret = EINVAL; + } else if (prom_get_unum(synd_code, P2ALIGN(afar, 8), + buf, buflen, lenp) != 0) { + ret = EIO; + } else if (*lenp <= 1) { + ret = EINVAL; + } else { + ret = 0; + } + } else { + ret = ENOTSUP; + } + + if (ret != 0) { + buf[0] = '\0'; + *lenp = 0; + } + + return (ret); +} + +/* + * Wrapper for cpu_get_mem_unum() routine that takes an + * async_flt struct rather than explicit arguments. 
+ */ +int +cpu_get_mem_unum_aflt(int synd_status, struct async_flt *aflt, + char *buf, int buflen, int *lenp) +{ + return (cpu_get_mem_unum(synd_status, SYND(aflt->flt_synd), + aflt->flt_stat, aflt->flt_addr, aflt->flt_bus_id, + aflt->flt_in_memory, aflt->flt_status, buf, buflen, lenp)); +} + +/* + * This routine is a more generic interface to cpu_get_mem_unum(), + * that may be used by other modules (e.g. mm). + */ +int +cpu_get_mem_name(uint64_t synd, uint64_t *afsr, uint64_t afar, + char *buf, int buflen, int *lenp) +{ + int synd_status, flt_in_memory, ret; + char unum[UNUM_NAMLEN]; + + /* + * Check for an invalid address. + */ + if (afar == (uint64_t)-1) + return (ENXIO); + + if (synd == (uint64_t)-1) + synd_status = AFLT_STAT_INVALID; + else + synd_status = AFLT_STAT_VALID; + + flt_in_memory = (pf_is_memory(afar >> MMU_PAGESHIFT)) ? 1 : 0; + + if ((ret = cpu_get_mem_unum(synd_status, (ushort_t)synd, *afsr, afar, + CPU->cpu_id, flt_in_memory, 0, unum, UNUM_NAMLEN, lenp)) + != 0) + return (ret); + + if (*lenp >= buflen) + return (ENAMETOOLONG); + + (void) strncpy(buf, unum, buflen); + + return (0); +} + +/* + * Routine to return memory information associated + * with a physical address and syndrome. + */ +/* ARGSUSED */ +int +cpu_get_mem_info(uint64_t synd, uint64_t afar, + uint64_t *mem_sizep, uint64_t *seg_sizep, uint64_t *bank_sizep, + int *segsp, int *banksp, int *mcidp) +{ + return (ENOTSUP); +} + +/* + * Routine to return a string identifying the physical + * name associated with a cpuid. + */ +/* ARGSUSED */ +int +cpu_get_cpu_unum(int cpuid, char *buf, int buflen, int *lenp) +{ + return (ENOTSUP); +} + +/* + * This routine returns the size of the kernel's FRU name buffer. + */ +size_t +cpu_get_name_bufsize() +{ + return (UNUM_NAMLEN); +} + +/* + * Cpu specific log func for UEs. + */ +static void +log_ue_err(struct async_flt *aflt, char *unum) +{ + spitf_async_flt *spf_flt = (spitf_async_flt *)aflt; + int len = 0; + +#ifdef DEBUG + int afsr_priv = (aflt->flt_stat & P_AFSR_PRIV) ? 1 : 0; + + /* + * Paranoid Check for priv mismatch + * Only applicable for UEs + */ + if (afsr_priv != aflt->flt_priv) { + /* + * The priv bits in %tstate and %afsr did not match; we expect + * this to be very rare, so flag it with a message. + */ + cpu_aflt_log(CE_WARN, 2, spf_flt, CPU_ERRID_FIRST, NULL, + ": PRIV bit in TSTATE and AFSR mismatched; " + "TSTATE.PRIV=%d used", (aflt->flt_priv) ? 1 : 0); + + /* update saved afsr to reflect the correct priv */ + aflt->flt_stat &= ~P_AFSR_PRIV; + if (aflt->flt_priv) + aflt->flt_stat |= P_AFSR_PRIV; + } +#endif /* DEBUG */ + + (void) cpu_get_mem_unum_aflt(AFLT_STAT_VALID, aflt, unum, + UNUM_NAMLEN, &len); + + cpu_aflt_log(CE_WARN, 1, spf_flt, UE_LFLAGS, unum, + " Uncorrectable Memory Error on"); + + if (SYND(aflt->flt_synd) == 0x3) { + cpu_aflt_log(CE_WARN, 1, spf_flt, CPU_ERRID_FIRST, NULL, + " Syndrome 0x3 indicates that this may not be a " + "memory module problem"); + } + + if (aflt->flt_in_memory) + cpu_log_ecmem_info(spf_flt); +} + + +/* + * The cpu_async_log_err() function is called via the ue_drain() function to + * handle logging for CPU events that are dequeued. As such, it can be invoked + * from softint context, from AST processing in the trap() flow, or from the + * panic flow. We decode the CPU-specific data, and log appropriate messages. 
+ */ +static void +cpu_async_log_err(void *flt) +{ + spitf_async_flt *spf_flt = (spitf_async_flt *)flt; + struct async_flt *aflt = (struct async_flt *)flt; + char unum[UNUM_NAMLEN]; + char *space; + char *ecache_scrub_logstr = NULL; + + switch (spf_flt->flt_type) { + case CPU_UE_ERR: + /* + * We want to skip logging only if ALL the following + * conditions are true: + * + * 1. We are not panicking + * 2. There is only one error + * 3. That error is a memory error + * 4. The error is caused by the memory scrubber (in + * which case the error will have occurred under + * on_trap protection) + * 5. The error is on a retired page + * + * Note 1: AFLT_PROT_EC is used places other than the memory + * scrubber. However, none of those errors should occur + * on a retired page. + * + * Note 2: In the CE case, these errors are discarded before + * the errorq. In the UE case, we must wait until now -- + * softcall() grabs a mutex, which we can't do at a high PIL. + */ + if (!panicstr && + (aflt->flt_stat & S_AFSR_ALL_ERRS) == P_AFSR_UE && + aflt->flt_prot == AFLT_PROT_EC) { + page_t *pp = page_numtopp_nolock((pfn_t) + (aflt->flt_addr >> MMU_PAGESHIFT)); + + if (pp != NULL && page_isretired(pp)) { + + /* Zero the address to clear the error */ + softcall(ecc_page_zero, (void *)aflt->flt_addr); + return; + } + } + + /* + * Log the UE and check for causes of this UE error that + * don't cause a trap (Copyback error). cpu_async_error() + * has already checked the i/o buses for us. + */ + log_ue_err(aflt, unum); + if (aflt->flt_in_memory) + cpu_check_allcpus(aflt); + break; + + case CPU_EDP_LDP_ERR: + if (aflt->flt_stat & P_AFSR_EDP) + cpu_aflt_log(CE_WARN, 1, spf_flt, PARERR_LFLAGS, + NULL, " EDP event on"); + + if (aflt->flt_stat & P_AFSR_LDP) + cpu_aflt_log(CE_WARN, 1, spf_flt, PARERR_LFLAGS, + NULL, " LDP event on"); + + /* Log ecache info if exist */ + if (spf_flt->flt_ec_lcnt > 0) { + cpu_log_ecmem_info(spf_flt); + + cpu_aflt_log(CE_CONT, 2, spf_flt, CPU_ERRID_FIRST, + NULL, " AFAR was derived from E$Tag"); + } else { + cpu_aflt_log(CE_CONT, 2, spf_flt, CPU_ERRID_FIRST, + NULL, " No error found in ecache (No fault " + "PA available)"); + } + break; + + case CPU_WP_ERR: + /* + * If the memscrub thread hasn't yet read + * all of memory, as we requested in the + * trap handler, then give it a kick to + * make sure it does. + */ + if (!isus2i && !isus2e && read_all_memscrub) + memscrub_run(); + + cpu_aflt_log(CE_WARN, 1, spf_flt, WP_LFLAGS, NULL, + " WP event on"); + return; + + case CPU_BTO_BERR_ERR: + /* + * A bus timeout or error occurred that was in user mode or not + * in a protected kernel code region. + */ + if (aflt->flt_stat & P_AFSR_BERR) { + cpu_aflt_log(CE_WARN, aflt->flt_panic ? 1 : 2, + spf_flt, BERRTO_LFLAGS, NULL, + " Bus Error on System Bus in %s mode from", + aflt->flt_priv ? "privileged" : "user"); + } + + if (aflt->flt_stat & P_AFSR_TO) { + cpu_aflt_log(CE_WARN, aflt->flt_panic ? 1 : 2, + spf_flt, BERRTO_LFLAGS, NULL, + " Timeout on System Bus in %s mode from", + aflt->flt_priv ? "privileged" : "user"); + } + + return; + + case CPU_PANIC_CP_ERR: + /* + * Process the Copyback (CP) error info (if any) obtained from + * polling all the cpus in the panic flow. This case is only + * entered if we are panicking. 
+ */ + ASSERT(panicstr != NULL); + ASSERT(aflt->flt_id == panic_aflt.flt_id); + + /* See which space - this info may not exist */ + if (panic_aflt.flt_status & ECC_D_TRAP) + space = "Data "; + else if (panic_aflt.flt_status & ECC_I_TRAP) + space = "Instruction "; + else + space = ""; + + cpu_aflt_log(CE_WARN, 1, spf_flt, CP_LFLAGS, NULL, + " AFAR was derived from UE report," + " CP event on CPU%d (caused %saccess error on %s%d)", + aflt->flt_inst, space, (panic_aflt.flt_status & ECC_IOBUS) ? + "IOBUS" : "CPU", panic_aflt.flt_bus_id); + + if (spf_flt->flt_ec_lcnt > 0) + cpu_log_ecmem_info(spf_flt); + else + cpu_aflt_log(CE_WARN, 2, spf_flt, CPU_ERRID_FIRST, + NULL, " No cache dump available"); + + return; + + case CPU_TRAPPING_CP_ERR: + /* + * For sabre only. This is a copyback ecache parity error due + * to a PCI DMA read. We should be panicking if we get here. + */ + ASSERT(panicstr != NULL); + cpu_aflt_log(CE_WARN, 1, spf_flt, CP_LFLAGS, NULL, + " AFAR was derived from UE report," + " CP event on CPU%d (caused Data access error " + "on PCIBus)", aflt->flt_inst); + return; + + /* + * We log the ecache lines of the following states, + * clean_bad_idle, clean_bad_busy, dirty_bad_idle and + * dirty_bad_busy if ecache_scrub_verbose is set and panic + * in addition to logging if ecache_scrub_panic is set. + */ + case CPU_BADLINE_CI_ERR: + ecache_scrub_logstr = "CBI"; + /* FALLTHRU */ + + case CPU_BADLINE_CB_ERR: + if (ecache_scrub_logstr == NULL) + ecache_scrub_logstr = "CBB"; + /* FALLTHRU */ + + case CPU_BADLINE_DI_ERR: + if (ecache_scrub_logstr == NULL) + ecache_scrub_logstr = "DBI"; + /* FALLTHRU */ + + case CPU_BADLINE_DB_ERR: + if (ecache_scrub_logstr == NULL) + ecache_scrub_logstr = "DBB"; + + cpu_aflt_log(CE_NOTE, 2, spf_flt, + (CPU_ERRID_FIRST | CPU_FLTCPU), NULL, + " %s event on", ecache_scrub_logstr); + cpu_log_ecmem_info(spf_flt); + + return; + + case CPU_ORPHAN_CP_ERR: + /* + * Orphan CPs, where the CP bit is set, but when a CPU + * doesn't report a UE. + */ + if (read_all_memscrub) + memscrub_run(); + + cpu_aflt_log(CE_NOTE, 2, spf_flt, (CP_LFLAGS | CPU_FLTCPU), + NULL, " Orphan CP event on"); + + /* Log ecache info if exist */ + if (spf_flt->flt_ec_lcnt > 0) + cpu_log_ecmem_info(spf_flt); + else + cpu_aflt_log(CE_NOTE, 2, spf_flt, + (CP_LFLAGS | CPU_FLTCPU), NULL, + " No error found in ecache (No fault " + "PA available"); + return; + + case CPU_ECACHE_ADDR_PAR_ERR: + cpu_aflt_log(CE_WARN, 1, spf_flt, PARERR_LFLAGS, NULL, + " E$ Tag Address Parity error on"); + cpu_log_ecmem_info(spf_flt); + return; + + case CPU_ECACHE_STATE_ERR: + cpu_aflt_log(CE_WARN, 1, spf_flt, PARERR_LFLAGS, NULL, + " E$ Tag State Parity error on"); + cpu_log_ecmem_info(spf_flt); + return; + + case CPU_ECACHE_TAG_ERR: + cpu_aflt_log(CE_WARN, 1, spf_flt, PARERR_LFLAGS, NULL, + " E$ Tag scrub event on"); + cpu_log_ecmem_info(spf_flt); + return; + + case CPU_ECACHE_ETP_ETS_ERR: + cpu_aflt_log(CE_WARN, 1, spf_flt, PARERR_LFLAGS, NULL, + " AFSR.ETP is set and AFSR.ETS is zero on"); + cpu_log_ecmem_info(spf_flt); + return; + + + case CPU_ADDITIONAL_ERR: + cpu_aflt_log(CE_WARN, 1, spf_flt, CMN_LFLAGS & ~CPU_SPACE, NULL, + " Additional errors detected during error processing on"); + return; + + default: + cmn_err(CE_WARN, "cpu_async_log_err: fault %p has unknown " + "fault type %x", (void *)spf_flt, spf_flt->flt_type); + return; + } + + /* ... 
fall through from the UE, EDP, or LDP cases */ + + if (aflt->flt_addr != AFLT_INV_ADDR && aflt->flt_in_memory) { + if (!panicstr) { + /* + * Retire the bad page that caused the error + */ + page_t *pp = page_numtopp_nolock((pfn_t) + (aflt->flt_addr >> MMU_PAGESHIFT)); + + if (pp != NULL) { + page_settoxic(pp, PAGE_IS_FAULTY); + (void) page_retire(pp, PAGE_IS_TOXIC); + } else { + uint64_t pa = + P2ALIGN(aflt->flt_addr, MMU_PAGESIZE); + + cpu_aflt_log(CE_CONT, 3, spf_flt, + CPU_ERRID_FIRST, NULL, + ": cannot schedule clearing of error on " + "page 0x%08x.%08x; page not in VM system", + (uint32_t)(pa >> 32), (uint32_t)pa); + } + } else { + /* + * Clear UEs on panic so that we don't + * get haunted by them during panic or + * after reboot + */ + clearphys(P2ALIGN(aflt->flt_addr, 64), + cpunodes[CPU->cpu_id].ecache_size, + cpunodes[CPU->cpu_id].ecache_linesize); + + (void) clear_errors(NULL, NULL); + } + } + + /* + * Log final recover message + */ + if (!panicstr) { + if (!aflt->flt_priv) { + cpu_aflt_log(CE_CONT, 3, spf_flt, CPU_ERRID_FIRST, + NULL, " Above Error is in User Mode" + "\n and is fatal: " + "will SIGKILL process and notify contract"); + } else if (aflt->flt_prot == AFLT_PROT_COPY && aflt->flt_core) { + cpu_aflt_log(CE_CONT, 3, spf_flt, CPU_ERRID_FIRST, + NULL, " Above Error detected while dumping core;" + "\n core file will be truncated"); + } else if (aflt->flt_prot == AFLT_PROT_COPY) { + cpu_aflt_log(CE_CONT, 3, spf_flt, CPU_ERRID_FIRST, + NULL, " Above Error is due to Kernel access" + "\n to User space and is fatal: " + "will SIGKILL process and notify contract"); + } else if (aflt->flt_prot == AFLT_PROT_EC) { + cpu_aflt_log(CE_CONT, 3, spf_flt, CPU_ERRID_FIRST, NULL, + " Above Error detected by protected Kernel code" + "\n that will try to clear error from system"); + } + } +} + + +/* + * Check all cpus for non-trapping UE-causing errors + * In Ultra I/II, we look for copyback errors (CPs) + */ +void +cpu_check_allcpus(struct async_flt *aflt) +{ + spitf_async_flt cp; + spitf_async_flt *spf_cpflt = &cp; + struct async_flt *cpflt = (struct async_flt *)&cp; + int pix; + + cpflt->flt_id = aflt->flt_id; + cpflt->flt_addr = aflt->flt_addr; + + for (pix = 0; pix < NCPU; pix++) { + if (CPU_XCALL_READY(pix)) { + xc_one(pix, (xcfunc_t *)get_cpu_status, + (uint64_t)cpflt, 0); + + if (cpflt->flt_stat & P_AFSR_CP) { + char *space; + + /* See which space - this info may not exist */ + if (aflt->flt_status & ECC_D_TRAP) + space = "Data "; + else if (aflt->flt_status & ECC_I_TRAP) + space = "Instruction "; + else + space = ""; + + cpu_aflt_log(CE_WARN, 1, spf_cpflt, CP_LFLAGS, + NULL, " AFAR was derived from UE report," + " CP event on CPU%d (caused %saccess " + "error on %s%d)", pix, space, + (aflt->flt_status & ECC_IOBUS) ? + "IOBUS" : "CPU", aflt->flt_bus_id); + + if (spf_cpflt->flt_ec_lcnt > 0) + cpu_log_ecmem_info(spf_cpflt); + else + cpu_aflt_log(CE_WARN, 2, spf_cpflt, + CPU_ERRID_FIRST, NULL, + " No cache dump available"); + } + } + } +} + +#ifdef DEBUG +int test_mp_cp = 0; +#endif + +/* + * Cross-call callback routine to tell a CPU to read its own %afsr to check + * for copyback errors and capture relevant information. 
+ */ +static uint_t +get_cpu_status(uint64_t arg) +{ + struct async_flt *aflt = (struct async_flt *)arg; + spitf_async_flt *spf_flt = (spitf_async_flt *)arg; + uint64_t afsr; + uint32_t ec_idx; + uint64_t sdbh, sdbl; + int i; + uint32_t ec_set_size; + uchar_t valid; + ec_data_t ec_data[8]; + uint64_t ec_tag, flt_addr_tag, oafsr; + uint64_t *acc_afsr = NULL; + + get_asyncflt(&afsr); + if (CPU_PRIVATE(CPU) != NULL) { + acc_afsr = CPU_PRIVATE_PTR(CPU, sfpr_scrub_afsr); + afsr |= *acc_afsr; + *acc_afsr = 0; + } + +#ifdef DEBUG + if (test_mp_cp) + afsr |= P_AFSR_CP; +#endif + aflt->flt_stat = afsr; + + if (afsr & P_AFSR_CP) { + /* + * Capture the UDBs + */ + get_udb_errors(&sdbh, &sdbl); + spf_flt->flt_sdbh = (ushort_t)(sdbh & 0x3FF); + spf_flt->flt_sdbl = (ushort_t)(sdbl & 0x3FF); + + /* + * Clear CP bit before capturing ecache data + * and AFSR info. + */ + set_asyncflt(P_AFSR_CP); + + /* + * See if we can capture the ecache line for the + * fault PA. + * + * Return a valid matching ecache line, if any. + * Otherwise, return the first matching ecache + * line marked invalid. + */ + flt_addr_tag = aflt->flt_addr >> cpu_ec_tag_shift; + ec_set_size = cpunodes[CPU->cpu_id].ecache_size / + ecache_associativity; + spf_flt->flt_ec_lcnt = 0; + + for (i = 0, ec_idx = (aflt->flt_addr % ec_set_size); + i < ecache_associativity; i++, ec_idx += ec_set_size) { + get_ecache_dtag(P2ALIGN(ec_idx, 64), + (uint64_t *)&ec_data[0], &ec_tag, &oafsr, + acc_afsr); + + if ((ec_tag & cpu_ec_tag_mask) != flt_addr_tag) + continue; + + valid = cpu_ec_state_valid & + (uchar_t)((ec_tag & cpu_ec_state_mask) >> + cpu_ec_state_shift); + + if (valid || spf_flt->flt_ec_lcnt == 0) { + spf_flt->flt_ec_tag = ec_tag; + bcopy(&ec_data, &spf_flt->flt_ec_data, + sizeof (ec_data)); + spf_flt->flt_ec_lcnt = 1; + + if (valid) + break; + } + } + } + return (0); +} + +/* + * CPU-module callback for the non-panicking CPUs. This routine is invoked + * from panic_idle() as part of the other CPUs stopping themselves when a + * panic occurs. We need to be VERY careful what we do here, since panicstr + * is NOT set yet and we cannot blow through locks. If panic_aflt is set + * (panic_aflt.flt_id is non-zero), we need to read our %afsr to look for + * CP error information. + */ +void +cpu_async_panic_callb(void) +{ + spitf_async_flt cp; + struct async_flt *aflt = (struct async_flt *)&cp; + uint64_t *scrub_afsr; + + if (panic_aflt.flt_id != 0) { + aflt->flt_addr = panic_aflt.flt_addr; + (void) get_cpu_status((uint64_t)aflt); + + if (CPU_PRIVATE(CPU) != NULL) { + scrub_afsr = CPU_PRIVATE_PTR(CPU, sfpr_scrub_afsr); + if (*scrub_afsr & P_AFSR_CP) { + aflt->flt_stat |= *scrub_afsr; + *scrub_afsr = 0; + } + } + if (aflt->flt_stat & P_AFSR_CP) { + aflt->flt_id = panic_aflt.flt_id; + aflt->flt_panic = 1; + aflt->flt_inst = CPU->cpu_id; + aflt->flt_class = CPU_FAULT; + cp.flt_type = CPU_PANIC_CP_ERR; + cpu_errorq_dispatch(FM_EREPORT_CPU_USII_CP, + (void *)&cp, sizeof (cp), ue_queue, + aflt->flt_panic); + } + } +} + +/* + * Turn off all cpu error detection, normally only used for panics. + */ +void +cpu_disable_errors(void) +{ + xt_all(set_error_enable_tl1, EER_DISABLE, EER_SET_ABSOLUTE); +} + +/* + * Enable errors. 
+ */ +void +cpu_enable_errors(void) +{ + xt_all(set_error_enable_tl1, EER_ENABLE, EER_SET_ABSOLUTE); +} + +static void +cpu_read_paddr(struct async_flt *ecc, short verbose, short ce_err) +{ + uint64_t aligned_addr = P2ALIGN(ecc->flt_addr, 8); + int i, loop = 1; + ushort_t ecc_0; + uint64_t paddr; + uint64_t data; + + if (verbose) + loop = 8; + for (i = 0; i < loop; i++) { + paddr = aligned_addr + (i * 8); + data = lddphys(paddr); + if (verbose) { + if (ce_err) { + ecc_0 = ecc_gen((uint32_t)(data>>32), + (uint32_t)data); + cpu_aflt_log(CE_CONT, 0, NULL, NO_LFLAGS, + NULL, " Paddr 0x%" PRIx64 ", " + "Data 0x%08x.%08x, ECC 0x%x", paddr, + (uint32_t)(data>>32), (uint32_t)data, ecc_0); + } else { + cpu_aflt_log(CE_CONT, 0, NULL, NO_LFLAGS, + NULL, " Paddr 0x%" PRIx64 ", " + "Data 0x%08x.%08x", paddr, + (uint32_t)(data>>32), (uint32_t)data); + } + } + } +} + +static struct { /* sec-ded-s4ed ecc code */ + uint_t hi, lo; +} ecc_code[8] = { + { 0xee55de23U, 0x16161161U }, + { 0x55eede93U, 0x61612212U }, + { 0xbb557b8cU, 0x49494494U }, + { 0x55bb7b6cU, 0x94948848U }, + { 0x16161161U, 0xee55de23U }, + { 0x61612212U, 0x55eede93U }, + { 0x49494494U, 0xbb557b8cU }, + { 0x94948848U, 0x55bb7b6cU } +}; + +static ushort_t +ecc_gen(uint_t high_bytes, uint_t low_bytes) +{ + int i, j; + uchar_t checker, bit_mask; + struct { + uint_t hi, lo; + } hex_data, masked_data[8]; + + hex_data.hi = high_bytes; + hex_data.lo = low_bytes; + + /* mask out bits according to sec-ded-s4ed ecc code */ + for (i = 0; i < 8; i++) { + masked_data[i].hi = hex_data.hi & ecc_code[i].hi; + masked_data[i].lo = hex_data.lo & ecc_code[i].lo; + } + + /* + * xor all bits in masked_data[i] to get bit_i of checker, + * where i = 0 to 7 + */ + checker = 0; + for (i = 0; i < 8; i++) { + bit_mask = 1 << i; + for (j = 0; j < 32; j++) { + if (masked_data[i].lo & 1) checker ^= bit_mask; + if (masked_data[i].hi & 1) checker ^= bit_mask; + masked_data[i].hi >>= 1; + masked_data[i].lo >>= 1; + } + } + return (checker); +} + +/* + * Flush the entire ecache using displacement flush by reading through a + * physical address range as large as the ecache. + */ +void +cpu_flush_ecache(void) +{ + flush_ecache(ecache_flushaddr, cpunodes[CPU->cpu_id].ecache_size * 2, + cpunodes[CPU->cpu_id].ecache_linesize); +} + +/* + * read and display the data in the cache line where the + * original ce error occurred. + * This routine is mainly used for debugging new hardware. + */ +void +read_ecc_data(struct async_flt *ecc, short verbose, short ce_err) +{ + kpreempt_disable(); + /* disable ECC error traps */ + set_error_enable(EER_ECC_DISABLE); + + /* + * flush the ecache + * read the data + * check to see if an ECC error occured + */ + flush_ecache(ecache_flushaddr, cpunodes[CPU->cpu_id].ecache_size * 2, + cpunodes[CPU->cpu_id].ecache_linesize); + set_lsu(get_lsu() | cache_boot_state); + cpu_read_paddr(ecc, verbose, ce_err); + (void) check_ecc(ecc); + + /* enable ECC error traps */ + set_error_enable(EER_ENABLE); + kpreempt_enable(); +} + +/* + * Check the AFSR bits for UE/CE persistence. + * If UE or CE errors are detected, the routine will + * clears all the AFSR sticky bits (except CP for + * spitfire/blackbird) and the UDBs. + * if ce_debug or ue_debug is set, log any ue/ce errors detected. 
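To make the sec-ded-s4ed table above a little more concrete, the following sketch recomputes the eight check bits for a doubleword using the same masks and parity rule as ecc_gen(), then flips a single data bit and shows that the computed check bits change. The data value is made up, and this only illustrates the structure of the code above, not the hardware's syndrome encoding.

#include <stdio.h>
#include <stdint.h>

typedef unsigned int uint_t;
typedef unsigned short ushort_t;

/* same sec-ded-s4ed masks as the ecc_code[] table above */
static const struct { uint_t hi, lo; } ecc_code[8] = {
	{ 0xee55de23U, 0x16161161U },
	{ 0x55eede93U, 0x61612212U },
	{ 0xbb557b8cU, 0x49494494U },
	{ 0x55bb7b6cU, 0x94948848U },
	{ 0x16161161U, 0xee55de23U },
	{ 0x61612212U, 0x55eede93U },
	{ 0x49494494U, 0xbb557b8cU },
	{ 0x94948848U, 0x55bb7b6cU }
};

/* check bit i is the parity of the data bits selected by ecc_code[i] */
static ushort_t
ecc_gen(uint_t hi, uint_t lo)
{
	ushort_t checker = 0;

	for (int i = 0; i < 8; i++) {
		uint_t mhi = hi & ecc_code[i].hi;
		uint_t mlo = lo & ecc_code[i].lo;
		uint_t parity = 0;

		for (int j = 0; j < 32; j++)
			parity ^= ((mhi >> j) ^ (mlo >> j)) & 1;
		if (parity)
			checker |= (1 << i);
	}
	return (checker);
}

int
main(void)
{
	uint64_t good = 0x0123456789abcdefULL;		/* made-up data */
	uint64_t bad = good ^ (1ULL << 17);		/* one flipped bit */
	ushort_t c_good = ecc_gen((uint_t)(good >> 32), (uint_t)good);
	ushort_t c_bad = ecc_gen((uint_t)(bad >> 32), (uint_t)bad);

	printf("check bits, original data:   0x%02x\n", c_good);
	printf("check bits, one bit flipped: 0x%02x\n", c_bad);
	printf("xor of the two:              0x%02x\n", c_good ^ c_bad);
	return (0);
}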
+ */ +static int +check_ecc(struct async_flt *ecc) +{ + uint64_t t_afsr; + uint64_t t_afar; + uint64_t udbh; + uint64_t udbl; + ushort_t udb; + int persistent = 0; + + /* + * Capture the AFSR, AFAR and UDBs info + */ + get_asyncflt(&t_afsr); + get_asyncaddr(&t_afar); + t_afar &= SABRE_AFAR_PA; + get_udb_errors(&udbh, &udbl); + + if ((t_afsr & P_AFSR_UE) || (t_afsr & P_AFSR_CE)) { + /* + * Clear the errors + */ + clr_datapath(); + + if (isus2i || isus2e) + set_asyncflt(t_afsr); + else + set_asyncflt(t_afsr & ~P_AFSR_CP); + + /* + * determine whether to check UDBH or UDBL for persistence + */ + if (ecc->flt_synd & UDBL_REG) { + udb = (ushort_t)udbl; + t_afar |= 0x8; + } else { + udb = (ushort_t)udbh; + } + + if (ce_debug || ue_debug) { + spitf_async_flt spf_flt; /* for logging */ + struct async_flt *aflt = + (struct async_flt *)&spf_flt; + + /* Package the info nicely in the spf_flt struct */ + bzero(&spf_flt, sizeof (spitf_async_flt)); + aflt->flt_stat = t_afsr; + aflt->flt_addr = t_afar; + spf_flt.flt_sdbh = (ushort_t)(udbh & 0x3FF); + spf_flt.flt_sdbl = (ushort_t)(udbl & 0x3FF); + + cpu_aflt_log(CE_CONT, 0, &spf_flt, (CPU_AFSR | + CPU_AFAR | CPU_UDBH | CPU_UDBL), NULL, + " check_ecc: Dumping captured error states ..."); + } + + /* + * if the fault addresses don't match, not persistent + */ + if (t_afar != ecc->flt_addr) { + return (persistent); + } + + /* + * check for UE persistence + * since all DIMMs in the bank are identified for a UE, + * there's no reason to check the syndrome + */ + if ((ecc->flt_stat & P_AFSR_UE) && (t_afsr & P_AFSR_UE)) { + persistent = 1; + } + + /* + * check for CE persistence + */ + if ((ecc->flt_stat & P_AFSR_CE) && (t_afsr & P_AFSR_CE)) { + if ((udb & P_DER_E_SYND) == + (ecc->flt_synd & P_DER_E_SYND)) { + persistent = 1; + } + } + } + return (persistent); +} + +#ifdef HUMMINGBIRD +#define HB_FULL_DIV 1 +#define HB_HALF_DIV 2 +#define HB_LOWEST_DIV 8 +#define HB_ECLK_INVALID 0xdeadbad +static uint64_t hb_eclk[HB_LOWEST_DIV + 1] = { + HB_ECLK_INVALID, HB_ECLK_1, HB_ECLK_2, HB_ECLK_INVALID, + HB_ECLK_4, HB_ECLK_INVALID, HB_ECLK_6, HB_ECLK_INVALID, + HB_ECLK_8 }; + +#define HB_SLOW_DOWN 0 +#define HB_SPEED_UP 1 + +#define SET_ESTAR_MODE(mode) \ + stdphysio(HB_ESTAR_MODE, (mode)); \ + /* \ + * PLL logic requires minimum of 16 clock \ + * cycles to lock to the new clock speed. \ + * Wait 1 usec to satisfy this requirement. \ + */ \ + drv_usecwait(1); + +#define CHANGE_REFRESH_COUNT(direction, cur_div, new_div) \ +{ \ + volatile uint64_t data; \ + uint64_t count, new_count; \ + clock_t delay; \ + data = lddphysio(HB_MEM_CNTRL0); \ + count = (data & HB_REFRESH_COUNT_MASK) >> \ + HB_REFRESH_COUNT_SHIFT; \ + new_count = (HB_REFRESH_INTERVAL * \ + cpunodes[CPU->cpu_id].clock_freq) / \ + (HB_REFRESH_CLOCKS_PER_COUNT * (new_div) * NANOSEC);\ + data = (data & ~HB_REFRESH_COUNT_MASK) | \ + (new_count << HB_REFRESH_COUNT_SHIFT); \ + stdphysio(HB_MEM_CNTRL0, data); \ + data = lddphysio(HB_MEM_CNTRL0); \ + /* \ + * If we are slowing down the cpu and Memory \ + * Self Refresh is not enabled, it is required \ + * to wait for old refresh count to count-down and \ + * new refresh count to go into effect (let new value \ + * counts down once). \ + */ \ + if ((direction) == HB_SLOW_DOWN && \ + (data & HB_SELF_REFRESH_MASK) == 0) { \ + /* \ + * Each count takes 64 cpu clock cycles \ + * to decrement. Wait for current refresh \ + * count plus new refresh count at current \ + * cpu speed to count down to zero. Round \ + * up the delay time. 
\ + */ \ + delay = ((HB_REFRESH_CLOCKS_PER_COUNT * \ + (count + new_count) * MICROSEC * (cur_div)) /\ + cpunodes[CPU->cpu_id].clock_freq) + 1; \ + drv_usecwait(delay); \ + } \ +} + +#define SET_SELF_REFRESH(bit) \ +{ \ + volatile uint64_t data; \ + data = lddphysio(HB_MEM_CNTRL0); \ + data = (data & ~HB_SELF_REFRESH_MASK) | \ + ((bit) << HB_SELF_REFRESH_SHIFT); \ + stdphysio(HB_MEM_CNTRL0, data); \ + data = lddphysio(HB_MEM_CNTRL0); \ +} +#endif /* HUMMINGBIRD */ + +/* ARGSUSED */ +void +cpu_change_speed(uint64_t new_divisor, uint64_t arg2) +{ +#ifdef HUMMINGBIRD + uint64_t cur_mask, cur_divisor = 0; + volatile uint64_t reg; + int index; + + if ((new_divisor < HB_FULL_DIV || new_divisor > HB_LOWEST_DIV) || + (hb_eclk[new_divisor] == HB_ECLK_INVALID)) { + cmn_err(CE_WARN, "cpu_change_speed: bad divisor 0x%lx", + new_divisor); + return; + } + + reg = lddphysio(HB_ESTAR_MODE); + cur_mask = reg & HB_ECLK_MASK; + for (index = HB_FULL_DIV; index <= HB_LOWEST_DIV; index++) { + if (hb_eclk[index] == cur_mask) { + cur_divisor = index; + break; + } + } + + if (cur_divisor == 0) + cmn_err(CE_PANIC, "cpu_change_speed: current divisor " + "can't be determined!"); + + /* + * If we are already at the requested divisor speed, just + * return. + */ + if (cur_divisor == new_divisor) + return; + + if (cur_divisor == HB_FULL_DIV && new_divisor == HB_HALF_DIV) { + CHANGE_REFRESH_COUNT(HB_SLOW_DOWN, cur_divisor, new_divisor); + SET_ESTAR_MODE(hb_eclk[new_divisor]); + SET_SELF_REFRESH(HB_SELF_REFRESH_ENABLE); + + } else if (cur_divisor == HB_HALF_DIV && new_divisor == HB_FULL_DIV) { + SET_SELF_REFRESH(HB_SELF_REFRESH_DISABLE); + SET_ESTAR_MODE(hb_eclk[new_divisor]); + /* LINTED: E_FALSE_LOGICAL_EXPR */ + CHANGE_REFRESH_COUNT(HB_SPEED_UP, cur_divisor, new_divisor); + + } else if (cur_divisor == HB_FULL_DIV && new_divisor > HB_HALF_DIV) { + /* + * Transition to 1/2 speed first, then to + * lower speed. + */ + CHANGE_REFRESH_COUNT(HB_SLOW_DOWN, cur_divisor, HB_HALF_DIV); + SET_ESTAR_MODE(hb_eclk[HB_HALF_DIV]); + SET_SELF_REFRESH(HB_SELF_REFRESH_ENABLE); + + CHANGE_REFRESH_COUNT(HB_SLOW_DOWN, HB_HALF_DIV, new_divisor); + SET_ESTAR_MODE(hb_eclk[new_divisor]); + + } else if (cur_divisor > HB_HALF_DIV && new_divisor == HB_FULL_DIV) { + /* + * Transition to 1/2 speed first, then to + * full speed. + */ + SET_ESTAR_MODE(hb_eclk[HB_HALF_DIV]); + /* LINTED: E_FALSE_LOGICAL_EXPR */ + CHANGE_REFRESH_COUNT(HB_SPEED_UP, cur_divisor, HB_HALF_DIV); + + SET_SELF_REFRESH(HB_SELF_REFRESH_DISABLE); + SET_ESTAR_MODE(hb_eclk[new_divisor]); + /* LINTED: E_FALSE_LOGICAL_EXPR */ + CHANGE_REFRESH_COUNT(HB_SPEED_UP, HB_HALF_DIV, new_divisor); + + } else if (cur_divisor < new_divisor) { + CHANGE_REFRESH_COUNT(HB_SLOW_DOWN, cur_divisor, new_divisor); + SET_ESTAR_MODE(hb_eclk[new_divisor]); + + } else if (cur_divisor > new_divisor) { + SET_ESTAR_MODE(hb_eclk[new_divisor]); + /* LINTED: E_FALSE_LOGICAL_EXPR */ + CHANGE_REFRESH_COUNT(HB_SPEED_UP, cur_divisor, new_divisor); + } + CPU->cpu_m.divisor = (uchar_t)new_divisor; +#endif +} + +/* + * Clear the AFSR sticky bits and the UDBs. For Sabre/Spitfire/Blackbird, + * we clear all the sticky bits. If a non-null pointer to a async fault + * structure argument is passed in, the captured error state (AFSR, AFAR, UDBs) + * info will be returned in the structure. If a non-null pointer to a + * uint64_t is passed in, this will be updated if the CP bit is set in the + * AFSR. The afsr will be returned. 
+ */ +static uint64_t +clear_errors(spitf_async_flt *spf_flt, uint64_t *acc_afsr) +{ + struct async_flt *aflt = (struct async_flt *)spf_flt; + uint64_t afsr; + uint64_t udbh, udbl; + + get_asyncflt(&afsr); + + if ((acc_afsr != NULL) && (afsr & P_AFSR_CP)) + *acc_afsr |= afsr; + + if (spf_flt != NULL) { + aflt->flt_stat = afsr; + get_asyncaddr(&aflt->flt_addr); + aflt->flt_addr &= SABRE_AFAR_PA; + + get_udb_errors(&udbh, &udbl); + spf_flt->flt_sdbh = (ushort_t)(udbh & 0x3FF); + spf_flt->flt_sdbl = (ushort_t)(udbl & 0x3FF); + } + + set_asyncflt(afsr); /* clear afsr */ + clr_datapath(); /* clear udbs */ + return (afsr); +} + +/* + * Scan the ecache to look for bad lines. If found, the afsr, afar, e$ data + * tag of the first bad line will be returned. We also return the old-afsr + * (before clearing the sticky bits). The linecnt data will be updated to + * indicate the number of bad lines detected. + */ +static void +scan_ecache(uint64_t *t_afar, ec_data_t *ecache_data, + uint64_t *ecache_tag, int *linecnt, uint64_t *t_afsr) +{ + ec_data_t t_ecdata[8]; + uint64_t t_etag, oafsr; + uint64_t pa = AFLT_INV_ADDR; + uint32_t i, j, ecache_sz; + uint64_t acc_afsr = 0; + uint64_t *cpu_afsr = NULL; + + if (CPU_PRIVATE(CPU) != NULL) + cpu_afsr = CPU_PRIVATE_PTR(CPU, sfpr_scrub_afsr); + + *linecnt = 0; + ecache_sz = cpunodes[CPU->cpu_id].ecache_size; + + for (i = 0; i < ecache_sz; i += 64) { + get_ecache_dtag(i, (uint64_t *)&t_ecdata[0], &t_etag, &oafsr, + cpu_afsr); + acc_afsr |= oafsr; + + /* + * Scan through the whole 64 bytes line in 8 8-byte chunks + * looking for the first occurrence of an EDP error. The AFSR + * info is captured for each 8-byte chunk. Note that for + * Spitfire/Blackbird, the AFSR.PSYND is captured by h/w in + * 16-byte chunk granularity (i.e. the AFSR will be the same + * for the high and low 8-byte words within the 16-byte chunk). + * For Sabre/Hummingbird, the AFSR.PSYND is captured in 8-byte + * granularity and only PSYND bits [7:0] are used. + */ + for (j = 0; j < 8; j++) { + ec_data_t *ecdptr = &t_ecdata[j]; + + if (ecdptr->ec_afsr & P_AFSR_EDP) { + uint64_t errpa; + ushort_t psynd; + uint32_t ec_set_size = ecache_sz / + ecache_associativity; + + /* + * For Spitfire/Blackbird, we need to look at + * the PSYND to make sure that this 8-byte chunk + * is the right one. PSYND bits [15:8] belong + * to the upper 8-byte (even) chunk. Bits + * [7:0] belong to the lower 8-byte chunk (odd). + */ + psynd = ecdptr->ec_afsr & P_AFSR_P_SYND; + if (!isus2i && !isus2e) { + if (j & 0x1) + psynd = psynd & 0xFF; + else + psynd = psynd >> 8; + + if (!psynd) + continue; /* wrong chunk */ + } + + /* Construct the PA */ + errpa = ((t_etag & cpu_ec_tag_mask) << + cpu_ec_tag_shift) | ((i | (j << 3)) % + ec_set_size); + + /* clean up the cache line */ + flushecacheline(P2ALIGN(errpa, 64), + cpunodes[CPU->cpu_id].ecache_size); + + oafsr = clear_errors(NULL, cpu_afsr); + acc_afsr |= oafsr; + + (*linecnt)++; + + /* + * Capture the PA for the first bad line found. + * Return the ecache dump and tag info. + */ + if (pa == AFLT_INV_ADDR) { + int k; + + pa = errpa; + for (k = 0; k < 8; k++) + ecache_data[k] = t_ecdata[k]; + *ecache_tag = t_etag; + } + break; + } + } + } + *t_afar = pa; + *t_afsr = acc_afsr; +} + +static void +cpu_log_ecmem_info(spitf_async_flt *spf_flt) +{ + struct async_flt *aflt = (struct async_flt *)spf_flt; + uint64_t ecache_tag = spf_flt->flt_ec_tag; + char linestr[30]; + char *state_str; + int i; + + /* + * Check the ecache tag to make sure it + * is valid. 
If invalid, a memory dump was + * captured instead of a ecache dump. + */ + if (spf_flt->flt_ec_tag != AFLT_INV_ADDR) { + uchar_t eparity = (uchar_t) + ((ecache_tag & cpu_ec_par_mask) >> cpu_ec_par_shift); + + uchar_t estate = (uchar_t) + ((ecache_tag & cpu_ec_state_mask) >> cpu_ec_state_shift); + + if (estate == cpu_ec_state_shr) + state_str = "Shared"; + else if (estate == cpu_ec_state_exl) + state_str = "Exclusive"; + else if (estate == cpu_ec_state_own) + state_str = "Owner"; + else if (estate == cpu_ec_state_mod) + state_str = "Modified"; + else + state_str = "Invalid"; + + if (spf_flt->flt_ec_lcnt > 1) { + (void) snprintf(linestr, sizeof (linestr), + "Badlines found=%d", spf_flt->flt_ec_lcnt); + } else { + linestr[0] = '\0'; + } + + cpu_aflt_log(CE_CONT, 2, spf_flt, CPU_ERRID_FIRST, NULL, + " PA=0x%08x.%08x\n E$tag 0x%08x.%08x E$State: %s " + "E$parity 0x%02x %s", (uint32_t)(aflt->flt_addr >> 32), + (uint32_t)aflt->flt_addr, (uint32_t)(ecache_tag >> 32), + (uint32_t)ecache_tag, state_str, + (uint32_t)eparity, linestr); + } else { + cpu_aflt_log(CE_CONT, 2, spf_flt, CPU_ERRID_FIRST, NULL, + " E$tag != PA from AFAR; E$line was victimized" + "\n dumping memory from PA 0x%08x.%08x instead", + (uint32_t)(P2ALIGN(aflt->flt_addr, 64) >> 32), + (uint32_t)P2ALIGN(aflt->flt_addr, 64)); + } + + /* + * Dump out all 8 8-byte ecache data captured + * For each 8-byte data captured, we check the + * captured afsr's parity syndrome to find out + * which 8-byte chunk is bad. For memory dump, the + * AFSR values were initialized to 0. + */ + for (i = 0; i < 8; i++) { + ec_data_t *ecdptr; + uint_t offset; + ushort_t psynd; + ushort_t bad; + uint64_t edp; + + offset = i << 3; /* multiply by 8 */ + ecdptr = &spf_flt->flt_ec_data[i]; + psynd = ecdptr->ec_afsr & P_AFSR_P_SYND; + edp = ecdptr->ec_afsr & P_AFSR_EDP; + + /* + * For Sabre/Hummingbird, parity synd is captured only + * in [7:0] of AFSR.PSYND for each 8-byte chunk. + * For spitfire/blackbird, AFSR.PSYND is captured + * in 16-byte granularity. [15:8] represent + * the upper 8 byte and [7:0] the lower 8 byte. + */ + if (isus2i || isus2e || (i & 0x1)) + bad = (psynd & 0xFF); /* check bits [7:0] */ + else + bad = (psynd & 0xFF00); /* check bits [15:8] */ + + if (bad && edp) { + cpu_aflt_log(CE_CONT, 2, spf_flt, NO_LFLAGS, NULL, + " E$Data (0x%02x): 0x%08x.%08x " + "*Bad* PSYND=0x%04x", offset, + (uint32_t)(ecdptr->ec_d8 >> 32), + (uint32_t)ecdptr->ec_d8, psynd); + } else { + cpu_aflt_log(CE_CONT, 2, spf_flt, NO_LFLAGS, NULL, + " E$Data (0x%02x): 0x%08x.%08x", offset, + (uint32_t)(ecdptr->ec_d8 >> 32), + (uint32_t)ecdptr->ec_d8); + } + } +} + +/* + * Common logging function for all cpu async errors. This function allows the + * caller to generate a single cmn_err() call that logs the appropriate items + * from the fault structure, and implements our rules for AFT logging levels. + * + * ce_code: cmn_err() code (e.g. CE_PANIC, CE_WARN, CE_CONT) + * tagnum: 0, 1, 2, .. generate the [AFT#] tag + * spflt: pointer to spitfire async fault structure + * logflags: bitflags indicating what to output + * endstr: a end string to appear at the end of this log + * fmt: a format string to appear at the beginning of the log + * + * The logflags allows the construction of predetermined output from the spflt + * structure. The individual data items always appear in a consistent order. 
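+ * For example, check_ecc() above requests a fixed set of items with:
+ *
+ *	cpu_aflt_log(CE_CONT, 0, &spf_flt, (CPU_AFSR | CPU_AFAR |
+ *	    CPU_UDBH | CPU_UDBL), NULL,
+ *	    " check_ecc: Dumping captured error states ...");
+ *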
+ * Note that either or both of the spflt structure pointer and logflags may be + * NULL or zero respectively, indicating that the predetermined output + * substrings are not requested in this log. The output looks like this: + * + * [AFT#] <CPU_ERRID_FIRST><fmt string><CPU_FLTCPU> + * <CPU_SPACE><CPU_ERRID> + * newline+4spaces<CPU_AFSR><CPU_AFAR> + * newline+4spaces<CPU_AF_PSYND><CPU_AF_ETS><CPU_FAULTPC> + * newline+4spaces<CPU_UDBH><CPU_UDBL> + * newline+4spaces<CPU_SYND> + * newline+4spaces<endstr> + * + * Note that <endstr> may not start on a newline if we are logging <CPU_PSYND>; + * it is assumed that <endstr> will be the unum string in this case. The size + * of our intermediate formatting buf[] is based on the worst case of all flags + * being enabled. We pass the caller's varargs directly to vcmn_err() for + * formatting so we don't need additional stack space to format them here. + */ +/*PRINTFLIKE6*/ +static void +cpu_aflt_log(int ce_code, int tagnum, spitf_async_flt *spflt, uint_t logflags, + const char *endstr, const char *fmt, ...) +{ + struct async_flt *aflt = (struct async_flt *)spflt; + char buf[400], *p, *q; /* see comments about buf[] size above */ + va_list ap; + int console_log_flag; + + if ((aflt == NULL) || ((aflt->flt_class == CPU_FAULT) && + (aflt->flt_stat & P_AFSR_LEVEL1)) || + (aflt->flt_panic)) { + console_log_flag = (tagnum < 2) || aft_verbose; + } else { + int verbose = ((aflt->flt_class == BUS_FAULT) || + (aflt->flt_stat & P_AFSR_CE)) ? + ce_verbose_memory : ce_verbose_other; + + if (!verbose) + return; + + console_log_flag = (verbose > 1); + } + + if (console_log_flag) + (void) sprintf(buf, "[AFT%d]", tagnum); + else + (void) sprintf(buf, "![AFT%d]", tagnum); + + p = buf + strlen(buf); /* current buffer position */ + q = buf + sizeof (buf); /* pointer past end of buffer */ + + if (spflt != NULL && (logflags & CPU_ERRID_FIRST)) { + (void) snprintf(p, (size_t)(q - p), " errID 0x%08x.%08x", + (uint32_t)(aflt->flt_id >> 32), (uint32_t)aflt->flt_id); + p += strlen(p); + } + + /* + * Copy the caller's format string verbatim into buf[]. It will be + * formatted by the call to vcmn_err() at the end of this function. + */ + if (fmt != NULL && p < q) { + (void) strncpy(p, fmt, (size_t)(q - p - 1)); + buf[sizeof (buf) - 1] = '\0'; + p += strlen(p); + } + + if (spflt != NULL) { + if (logflags & CPU_FLTCPU) { + (void) snprintf(p, (size_t)(q - p), " CPU%d", + aflt->flt_inst); + p += strlen(p); + } + + if (logflags & CPU_SPACE) { + if (aflt->flt_status & ECC_D_TRAP) + (void) snprintf(p, (size_t)(q - p), + " Data access"); + else if (aflt->flt_status & ECC_I_TRAP) + (void) snprintf(p, (size_t)(q - p), + " Instruction access"); + p += strlen(p); + } + + if (logflags & CPU_TL) { + (void) snprintf(p, (size_t)(q - p), " at TL%s", + aflt->flt_tl ? 
">0" : "=0"); + p += strlen(p); + } + + if (logflags & CPU_ERRID) { + (void) snprintf(p, (size_t)(q - p), + ", errID 0x%08x.%08x", + (uint32_t)(aflt->flt_id >> 32), + (uint32_t)aflt->flt_id); + p += strlen(p); + } + + if (logflags & CPU_AFSR) { + (void) snprintf(p, (size_t)(q - p), + "\n AFSR 0x%08b.%08b", + (uint32_t)(aflt->flt_stat >> 32), AFSR_FMTSTR0, + (uint32_t)aflt->flt_stat, AFSR_FMTSTR1); + p += strlen(p); + } + + if (logflags & CPU_AFAR) { + (void) snprintf(p, (size_t)(q - p), " AFAR 0x%08x.%08x", + (uint32_t)(aflt->flt_addr >> 32), + (uint32_t)aflt->flt_addr); + p += strlen(p); + } + + if (logflags & CPU_AF_PSYND) { + ushort_t psynd = (ushort_t) + (aflt->flt_stat & P_AFSR_P_SYND); + + (void) snprintf(p, (size_t)(q - p), + "\n AFSR.PSYND 0x%04x(Score %02d)", + psynd, ecc_psynd_score(psynd)); + p += strlen(p); + } + + if (logflags & CPU_AF_ETS) { + (void) snprintf(p, (size_t)(q - p), " AFSR.ETS 0x%02x", + (uchar_t)((aflt->flt_stat & P_AFSR_ETS) >> 16)); + p += strlen(p); + } + + if (logflags & CPU_FAULTPC) { + (void) snprintf(p, (size_t)(q - p), " Fault_PC 0x%p", + (void *)aflt->flt_pc); + p += strlen(p); + } + + if (logflags & CPU_UDBH) { + (void) snprintf(p, (size_t)(q - p), + "\n UDBH 0x%04b UDBH.ESYND 0x%02x", + spflt->flt_sdbh, UDB_FMTSTR, + spflt->flt_sdbh & 0xFF); + p += strlen(p); + } + + if (logflags & CPU_UDBL) { + (void) snprintf(p, (size_t)(q - p), + " UDBL 0x%04b UDBL.ESYND 0x%02x", + spflt->flt_sdbl, UDB_FMTSTR, + spflt->flt_sdbl & 0xFF); + p += strlen(p); + } + + if (logflags & CPU_SYND) { + ushort_t synd = SYND(aflt->flt_synd); + + (void) snprintf(p, (size_t)(q - p), + "\n %s Syndrome 0x%x Memory Module ", + UDBL(aflt->flt_synd) ? "UDBL" : "UDBH", synd); + p += strlen(p); + } + } + + if (endstr != NULL) { + if (!(logflags & CPU_SYND)) + (void) snprintf(p, (size_t)(q - p), "\n %s", endstr); + else + (void) snprintf(p, (size_t)(q - p), "%s", endstr); + p += strlen(p); + } + + if (ce_code == CE_CONT && (p < q - 1)) + (void) strcpy(p, "\n"); /* add final \n if needed */ + + va_start(ap, fmt); + vcmn_err(ce_code, buf, ap); + va_end(ap); +} + +/* + * Ecache Scrubbing + * + * The basic idea is to prevent lines from sitting in the ecache long enough + * to build up soft errors which can lead to ecache parity errors. + * + * The following rules are observed when flushing the ecache: + * + * 1. When the system is busy, flush bad clean lines + * 2. When the system is idle, flush all clean lines + * 3. When the system is idle, flush good dirty lines + * 4. Never flush bad dirty lines. + * + * modify parity busy idle + * ---------------------------- + * clean good X + * clean bad X X + * dirty good X + * dirty bad + * + * Bad or good refers to whether a line has an E$ parity error or not. + * Clean or dirty refers to the state of the modified bit. We currently + * default the scan rate to 100 (scan 10% of the cache per second). + * + * The following are E$ states and actions. + * + * We encode our state as a 3-bit number, consisting of: + * ECACHE_STATE_MODIFIED (0=clean, 1=dirty) + * ECACHE_STATE_PARITY (0=good, 1=bad) + * ECACHE_STATE_BUSY (0=idle, 1=busy) + * + * We associate a flushing and a logging action with each state. + * + * E$ actions are different for Spitfire and Sabre/Hummingbird modules. + * MIRROR_FLUSH indicates that an E$ line will be flushed for the mirrored + * E$ only, in addition to value being set by ec_flush. 
+ */ + +#define ALWAYS_FLUSH 0x1 /* flush E$ line on all E$ types */ +#define NEVER_FLUSH 0x0 /* never the flush the E$ line */ +#define MIRROR_FLUSH 0xF /* flush E$ line on mirrored E$ only */ + +struct { + char ec_flush; /* whether to flush or not */ + char ec_log; /* ecache logging */ + char ec_log_type; /* log type info */ +} ec_action[] = { /* states of the E$ line in M P B */ + { ALWAYS_FLUSH, 0, 0 }, /* 0 0 0 clean_good_idle */ + { MIRROR_FLUSH, 0, 0 }, /* 0 0 1 clean_good_busy */ + { ALWAYS_FLUSH, 1, CPU_BADLINE_CI_ERR }, /* 0 1 0 clean_bad_idle */ + { ALWAYS_FLUSH, 1, CPU_BADLINE_CB_ERR }, /* 0 1 1 clean_bad_busy */ + { ALWAYS_FLUSH, 0, 0 }, /* 1 0 0 dirty_good_idle */ + { MIRROR_FLUSH, 0, 0 }, /* 1 0 1 dirty_good_busy */ + { NEVER_FLUSH, 1, CPU_BADLINE_DI_ERR }, /* 1 1 0 dirty_bad_idle */ + { NEVER_FLUSH, 1, CPU_BADLINE_DB_ERR } /* 1 1 1 dirty_bad_busy */ +}; + +/* + * Offsets into the ec_action[] that determines clean_good_busy and + * dirty_good_busy lines. + */ +#define ECACHE_CGB_LINE 1 /* E$ clean_good_busy line */ +#define ECACHE_DGB_LINE 5 /* E$ dirty_good_busy line */ + +/* + * We are flushing lines which are Clean_Good_Busy and also the lines + * Dirty_Good_Busy. And we only follow it for non-mirrored E$. + */ +#define CGB(x, m) (((x) == ECACHE_CGB_LINE) && (m != ECACHE_CPU_MIRROR)) +#define DGB(x, m) (((x) == ECACHE_DGB_LINE) && (m != ECACHE_CPU_MIRROR)) + +#define ECACHE_STATE_MODIFIED 0x4 +#define ECACHE_STATE_PARITY 0x2 +#define ECACHE_STATE_BUSY 0x1 + +/* + * If ecache is mirrored ecache_calls_a_sec and ecache_scan_rate are reduced. + */ +int ecache_calls_a_sec_mirrored = 1; +int ecache_lines_per_call_mirrored = 1; + +int ecache_scrub_enable = 1; /* ecache scrubbing is on by default */ +int ecache_scrub_verbose = 1; /* prints clean and dirty lines */ +int ecache_scrub_panic = 0; /* panics on a clean and dirty line */ +int ecache_calls_a_sec = 100; /* scrubber calls per sec */ +int ecache_scan_rate = 100; /* scan rate (in tenths of a percent) */ +int ecache_idle_factor = 1; /* increase the scan rate when idle */ +int ecache_flush_clean_good_busy = 50; /* flush rate (in percent) */ +int ecache_flush_dirty_good_busy = 100; /* flush rate (in percent) */ + +volatile int ec_timeout_calls = 1; /* timeout calls */ + +/* + * Interrupt number and pil for ecache scrubber cross-trap calls. + */ +static uint_t ecache_scrub_inum; +uint_t ecache_scrub_pil = PIL_9; + +/* + * Kstats for the E$ scrubber. 
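+ * These counters are published per CPU by ecache_kstat_init() as the
+ * "unix:<cpuid>:ecache_kstat" named kstat, so (as a convenience, not a
+ * requirement of this code) they can be read from userland with
+ * something like:
+ *
+ *	kstat -m unix -n ecache_kstat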
+ */ +typedef struct ecache_kstat { + kstat_named_t clean_good_idle; /* # of lines scrubbed */ + kstat_named_t clean_good_busy; /* # of lines skipped */ + kstat_named_t clean_bad_idle; /* # of lines scrubbed */ + kstat_named_t clean_bad_busy; /* # of lines scrubbed */ + kstat_named_t dirty_good_idle; /* # of lines scrubbed */ + kstat_named_t dirty_good_busy; /* # of lines skipped */ + kstat_named_t dirty_bad_idle; /* # of lines skipped */ + kstat_named_t dirty_bad_busy; /* # of lines skipped */ + kstat_named_t invalid_lines; /* # of invalid lines */ + kstat_named_t clean_good_busy_flush; /* # of lines scrubbed */ + kstat_named_t dirty_good_busy_flush; /* # of lines scrubbed */ + kstat_named_t tags_cleared; /* # of E$ tags cleared */ +} ecache_kstat_t; + +static ecache_kstat_t ec_kstat_template = { + { "clean_good_idle", KSTAT_DATA_ULONG }, + { "clean_good_busy", KSTAT_DATA_ULONG }, + { "clean_bad_idle", KSTAT_DATA_ULONG }, + { "clean_bad_busy", KSTAT_DATA_ULONG }, + { "dirty_good_idle", KSTAT_DATA_ULONG }, + { "dirty_good_busy", KSTAT_DATA_ULONG }, + { "dirty_bad_idle", KSTAT_DATA_ULONG }, + { "dirty_bad_busy", KSTAT_DATA_ULONG }, + { "invalid_lines", KSTAT_DATA_ULONG }, + { "clean_good_busy_flush", KSTAT_DATA_ULONG }, + { "dirty_good_busy_flush", KSTAT_DATA_ULONG }, + { "ecache_tags_cleared", KSTAT_DATA_ULONG } +}; + +struct kmem_cache *sf_private_cache; + +/* + * Called periodically on each CPU to scan the ecache once a sec. + * adjusting the ecache line index appropriately + */ +void +scrub_ecache_line() +{ + spitfire_scrub_misc_t *ssmp = CPU_PRIVATE_PTR(CPU, sfpr_scrub_misc); + int cpuid = CPU->cpu_id; + uint32_t index = ssmp->ecache_flush_index; + uint64_t ec_size = cpunodes[cpuid].ecache_size; + size_t ec_linesize = cpunodes[cpuid].ecache_linesize; + int nlines = ssmp->ecache_nlines; + uint32_t ec_set_size = ec_size / ecache_associativity; + int ec_mirror = ssmp->ecache_mirror; + ecache_kstat_t *ec_ksp = (ecache_kstat_t *)ssmp->ecache_ksp->ks_data; + + int line, scan_lines, flush_clean_busy = 0, flush_dirty_busy = 0; + int mpb; /* encode Modified, Parity, Busy for action */ + uchar_t state; + uint64_t ec_tag, paddr, oafsr, tafsr, nafsr; + uint64_t *acc_afsr = CPU_PRIVATE_PTR(CPU, sfpr_scrub_afsr); + ec_data_t ec_data[8]; + kstat_named_t *ec_knp; + + switch (ec_mirror) { + default: + case ECACHE_CPU_NON_MIRROR: + /* + * The E$ scan rate is expressed in units of tenths of + * a percent. ecache_scan_rate = 1000 (100%) means the + * whole cache is scanned every second. + */ + scan_lines = (nlines * ecache_scan_rate) / + (1000 * ecache_calls_a_sec); + if (!(ssmp->ecache_busy)) { + if (ecache_idle_factor > 0) { + scan_lines *= ecache_idle_factor; + } + } else { + flush_clean_busy = (scan_lines * + ecache_flush_clean_good_busy) / 100; + flush_dirty_busy = (scan_lines * + ecache_flush_dirty_good_busy) / 100; + } + + ec_timeout_calls = (ecache_calls_a_sec ? + ecache_calls_a_sec : 1); + break; + + case ECACHE_CPU_MIRROR: + scan_lines = ecache_lines_per_call_mirrored; + ec_timeout_calls = (ecache_calls_a_sec_mirrored ? + ecache_calls_a_sec_mirrored : 1); + break; + } + + /* + * The ecache scrubber algorithm operates by reading and + * decoding the E$ tag to determine whether the corresponding E$ line + * can be scrubbed. There is a implicit assumption in the scrubber + * logic that the E$ tag is valid. 
Unfortunately, this assertion is + * flawed since the E$ tag may also be corrupted and have parity errors + * The scrubber logic is enhanced to check the validity of the E$ tag + * before scrubbing. When a parity error is detected in the E$ tag, + * it is possible to recover and scrub the tag under certain conditions + * so that a ETP error condition can be avoided. + */ + + for (mpb = line = 0; line < scan_lines; line++, mpb = 0) { + /* + * We get the old-AFSR before clearing the AFSR sticky bits + * in {get_ecache_tag, check_ecache_line, get_ecache_dtag} + * If CP bit is set in the old-AFSR, we log an Orphan CP event. + */ + ec_tag = get_ecache_tag(index, &nafsr, acc_afsr); + state = (uchar_t)((ec_tag & cpu_ec_state_mask) >> + cpu_ec_state_shift); + + /* + * ETP is set try to scrub the ecache tag. + */ + if (nafsr & P_AFSR_ETP) { + ecache_scrub_tag_err(nafsr, state, index); + } else if (state & cpu_ec_state_valid) { + /* + * ETP is not set, E$ tag is valid. + * Proceed with the E$ scrubbing. + */ + if (state & cpu_ec_state_dirty) + mpb |= ECACHE_STATE_MODIFIED; + + tafsr = check_ecache_line(index, acc_afsr); + + if (tafsr & P_AFSR_EDP) { + mpb |= ECACHE_STATE_PARITY; + + if (ecache_scrub_verbose || + ecache_scrub_panic) { + get_ecache_dtag(P2ALIGN(index, 64), + (uint64_t *)&ec_data[0], + &ec_tag, &oafsr, acc_afsr); + } + } + + if (ssmp->ecache_busy) + mpb |= ECACHE_STATE_BUSY; + + ec_knp = (kstat_named_t *)ec_ksp + mpb; + ec_knp->value.ul++; + + paddr = ((ec_tag & cpu_ec_tag_mask) << + cpu_ec_tag_shift) | (index % ec_set_size); + + /* + * We flush the E$ lines depending on the ec_flush, + * we additionally flush clean_good_busy and + * dirty_good_busy lines for mirrored E$. + */ + if (ec_action[mpb].ec_flush == ALWAYS_FLUSH) { + flushecacheline(paddr, ec_size); + } else if ((ec_mirror == ECACHE_CPU_MIRROR) && + (ec_action[mpb].ec_flush == MIRROR_FLUSH)) { + flushecacheline(paddr, ec_size); + } else if (ec_action[mpb].ec_flush == NEVER_FLUSH) { + softcall(ecache_page_retire, (void *)paddr); + } + + /* + * Conditionally flush both the clean_good and + * dirty_good lines when busy. + */ + if (CGB(mpb, ec_mirror) && (flush_clean_busy > 0)) { + flush_clean_busy--; + flushecacheline(paddr, ec_size); + ec_ksp->clean_good_busy_flush.value.ul++; + } else if (DGB(mpb, ec_mirror) && + (flush_dirty_busy > 0)) { + flush_dirty_busy--; + flushecacheline(paddr, ec_size); + ec_ksp->dirty_good_busy_flush.value.ul++; + } + + if (ec_action[mpb].ec_log && (ecache_scrub_verbose || + ecache_scrub_panic)) { + ecache_scrub_log(ec_data, ec_tag, paddr, mpb, + tafsr); + } + + } else { + ec_ksp->invalid_lines.value.ul++; + } + + if ((index += ec_linesize) >= ec_size) + index = 0; + + } + + /* + * set the ecache scrub index for the next time around + */ + ssmp->ecache_flush_index = index; + + if (*acc_afsr & P_AFSR_CP) { + uint64_t ret_afsr; + + ret_afsr = ecache_scrub_misc_err(CPU_ORPHAN_CP_ERR, *acc_afsr); + if ((ret_afsr & P_AFSR_CP) == 0) + *acc_afsr = 0; + } +} + +/* + * Handler for ecache_scrub_inum softint. Call scrub_ecache_line until + * we decrement the outstanding request count to zero. 
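+ *
+ * The overall trigger path, pieced together from this file (the TL1
+ * cross-trap handler itself is implemented in assembly), is roughly:
+ *
+ *	realtime_timeout() -> do_scrub_ecache_line()
+ *	    -> xt_all(ecache_scrubreq_tl1, ecache_scrub_inum, 0)
+ *	    -> per-cpu soft interrupt at ecache_scrub_pil
+ *	    -> scrub_ecache_line_intr() -> scrub_ecache_line()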
+ */ + +/*ARGSUSED*/ +uint_t +scrub_ecache_line_intr(caddr_t arg1, caddr_t arg2) +{ + int i; + int outstanding; + spitfire_scrub_misc_t *ssmp = CPU_PRIVATE_PTR(CPU, sfpr_scrub_misc); + uint32_t *countp = &ssmp->ec_scrub_outstanding; + + do { + outstanding = *countp; + ASSERT(outstanding > 0); + for (i = 0; i < outstanding; i++) + scrub_ecache_line(); + } while (atomic_add_32_nv(countp, -outstanding)); + + return (DDI_INTR_CLAIMED); +} + +/* + * force each cpu to perform an ecache scrub, called from a timeout + */ +extern xcfunc_t ecache_scrubreq_tl1; + +void +do_scrub_ecache_line(void) +{ + long delta; + + if (ecache_calls_a_sec > hz) + ecache_calls_a_sec = hz; + else if (ecache_calls_a_sec <= 0) + ecache_calls_a_sec = 1; + + if (ecache_calls_a_sec_mirrored > hz) + ecache_calls_a_sec_mirrored = hz; + else if (ecache_calls_a_sec_mirrored <= 0) + ecache_calls_a_sec_mirrored = 1; + + if (ecache_scrub_enable) { + xt_all(ecache_scrubreq_tl1, ecache_scrub_inum, 0); + delta = hz / ec_timeout_calls; + } else { + delta = hz; + } + + (void) realtime_timeout((void(*)(void *))do_scrub_ecache_line, 0, + delta); +} + +/* + * initialization for ecache scrubbing + * This routine is called AFTER all cpus have had cpu_init_private called + * to initialize their private data areas. + */ +void +cpu_init_cache_scrub(void) +{ + if (ecache_calls_a_sec > hz) { + cmn_err(CE_NOTE, "ecache_calls_a_sec set too high (%d); " + "resetting to hz (%d)", ecache_calls_a_sec, hz); + ecache_calls_a_sec = hz; + } + + /* + * Register softint for ecache scrubbing. + */ + ecache_scrub_inum = add_softintr(ecache_scrub_pil, + scrub_ecache_line_intr, NULL); + + /* + * kick off the scrubbing using realtime timeout + */ + (void) realtime_timeout((void(*)(void *))do_scrub_ecache_line, 0, + hz / ecache_calls_a_sec); +} + +/* + * Unset the busy flag for this cpu. + */ +void +cpu_idle_ecache_scrub(struct cpu *cp) +{ + if (CPU_PRIVATE(cp) != NULL) { + spitfire_scrub_misc_t *ssmp = CPU_PRIVATE_PTR(cp, + sfpr_scrub_misc); + ssmp->ecache_busy = ECACHE_CPU_IDLE; + } +} + +/* + * Set the busy flag for this cpu. + */ +void +cpu_busy_ecache_scrub(struct cpu *cp) +{ + if (CPU_PRIVATE(cp) != NULL) { + spitfire_scrub_misc_t *ssmp = CPU_PRIVATE_PTR(cp, + sfpr_scrub_misc); + ssmp->ecache_busy = ECACHE_CPU_BUSY; + } +} + +/* + * initialize the ecache scrubber data structures + * The global entry point cpu_init_private replaces this entry point. + * + */ +static void +cpu_init_ecache_scrub_dr(struct cpu *cp) +{ + spitfire_scrub_misc_t *ssmp = CPU_PRIVATE_PTR(cp, sfpr_scrub_misc); + int cpuid = cp->cpu_id; + + /* + * intialize bookkeeping for cache scrubbing + */ + bzero(ssmp, sizeof (spitfire_scrub_misc_t)); + + ssmp->ecache_flush_index = 0; + + ssmp->ecache_nlines = + cpunodes[cpuid].ecache_size / cpunodes[cpuid].ecache_linesize; + + /* + * Determine whether we are running on mirrored SRAM + */ + + if (cpunodes[cpuid].msram == ECACHE_CPU_MIRROR) + ssmp->ecache_mirror = ECACHE_CPU_MIRROR; + else + ssmp->ecache_mirror = ECACHE_CPU_NON_MIRROR; + + cpu_busy_ecache_scrub(cp); + + /* + * initialize the kstats + */ + ecache_kstat_init(cp); +} + +/* + * uninitialize the ecache scrubber data structures + * The global entry point cpu_uninit_private replaces this entry point. 
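+ *
+ * Taken together with cpu_init_ecache_scrub_dr() above, the per-cpu
+ * lifecycle is roughly (see cpu_init_private()/cpu_uninit_private()
+ * below):
+ *
+ *	cpu_init_private(cp)
+ *	    -> cpu_init_ecache_scrub_dr(cp)	(bookkeeping, kstats, busy)
+ *	...
+ *	cpu_uninit_private(cp)
+ *	    -> cpu_uninit_ecache_scrub_dr(cp)	(delete kstats, clear state)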
+ */ +static void +cpu_uninit_ecache_scrub_dr(struct cpu *cp) +{ + spitfire_scrub_misc_t *ssmp = CPU_PRIVATE_PTR(cp, sfpr_scrub_misc); + + if (ssmp->ecache_ksp != NULL) { + kstat_delete(ssmp->ecache_ksp); + ssmp->ecache_ksp = NULL; + } + + /* + * un-initialize bookkeeping for cache scrubbing + */ + bzero(ssmp, sizeof (spitfire_scrub_misc_t)); + + cpu_idle_ecache_scrub(cp); +} + +struct kmem_cache *sf_private_cache; + +/* + * Cpu private initialization. This includes allocating the cpu_private + * data structure, initializing it, and initializing the scrubber for this + * cpu. This is called once for EVERY cpu, including CPU 0. This function + * calls cpu_init_ecache_scrub_dr to init the scrubber. + * We use kmem_cache_create for the spitfire private data structure because it + * needs to be allocated on a S_ECACHE_MAX_LSIZE (64) byte boundary. + */ +void +cpu_init_private(struct cpu *cp) +{ + spitfire_private_t *sfprp; + + ASSERT(CPU_PRIVATE(cp) == NULL); + + /* + * If the sf_private_cache has not been created, create it. + */ + if (sf_private_cache == NULL) { + sf_private_cache = kmem_cache_create("sf_private_cache", + sizeof (spitfire_private_t), S_ECACHE_MAX_LSIZE, NULL, + NULL, NULL, NULL, NULL, 0); + ASSERT(sf_private_cache); + } + + sfprp = CPU_PRIVATE(cp) = kmem_cache_alloc(sf_private_cache, KM_SLEEP); + + bzero(sfprp, sizeof (spitfire_private_t)); + + cpu_init_ecache_scrub_dr(cp); +} + +/* + * Cpu private unitialization. Uninitialize the Ecache scrubber and + * deallocate the scrubber data structures and cpu_private data structure. + * For now, this function just calls cpu_unint_ecache_scrub_dr to uninit + * the scrubber for the specified cpu. + */ +void +cpu_uninit_private(struct cpu *cp) +{ + ASSERT(CPU_PRIVATE(cp)); + + cpu_uninit_ecache_scrub_dr(cp); + kmem_cache_free(sf_private_cache, CPU_PRIVATE(cp)); + CPU_PRIVATE(cp) = NULL; +} + +/* + * initialize the ecache kstats for each cpu + */ +static void +ecache_kstat_init(struct cpu *cp) +{ + struct kstat *ksp; + spitfire_scrub_misc_t *ssmp = CPU_PRIVATE_PTR(cp, sfpr_scrub_misc); + + ASSERT(ssmp != NULL); + + if ((ksp = kstat_create("unix", cp->cpu_id, "ecache_kstat", "misc", + KSTAT_TYPE_NAMED, + sizeof (ecache_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_WRITABLE)) == NULL) { + ssmp->ecache_ksp = NULL; + cmn_err(CE_NOTE, "!ecache_kstat_init(%d) failed\n", cp->cpu_id); + return; + } + + ssmp->ecache_ksp = ksp; + bcopy(&ec_kstat_template, ksp->ks_data, sizeof (ecache_kstat_t)); + kstat_install(ksp); +} + +/* + * log the bad ecache information + */ +static void +ecache_scrub_log(ec_data_t *ec_data, uint64_t ec_tag, uint64_t paddr, int mpb, + uint64_t afsr) +{ + spitf_async_flt spf_flt; + struct async_flt *aflt; + int i; + char *class; + + bzero(&spf_flt, sizeof (spitf_async_flt)); + aflt = &spf_flt.cmn_asyncflt; + + for (i = 0; i < 8; i++) { + spf_flt.flt_ec_data[i] = ec_data[i]; + } + + spf_flt.flt_ec_tag = ec_tag; + + if (mpb < (sizeof (ec_action) / sizeof (ec_action[0]))) { + spf_flt.flt_type = ec_action[mpb].ec_log_type; + } else spf_flt.flt_type = (ushort_t)mpb; + + aflt->flt_inst = CPU->cpu_id; + aflt->flt_class = CPU_FAULT; + aflt->flt_id = gethrtime_waitfree(); + aflt->flt_addr = paddr; + aflt->flt_stat = afsr; + aflt->flt_panic = (uchar_t)ecache_scrub_panic; + + switch (mpb) { + case CPU_ECACHE_TAG_ERR: + case CPU_ECACHE_ADDR_PAR_ERR: + case CPU_ECACHE_ETP_ETS_ERR: + case CPU_ECACHE_STATE_ERR: + class = FM_EREPORT_CPU_USII_ESCRUB_TAG; + break; + default: + class = FM_EREPORT_CPU_USII_ESCRUB_DATA; + break; + } + + 
cpu_errorq_dispatch(class, (void *)&spf_flt, sizeof (spf_flt), + ue_queue, aflt->flt_panic); + + if (aflt->flt_panic) + cmn_err(CE_PANIC, "ecache_scrub_panic set and bad E$" + "line detected"); +} + +/* + * Process an ecache error that occured during the E$ scrubbing. + * We do the ecache scan to find the bad line, flush the bad line + * and start the memscrubber to find any UE (in memory or in another cache) + */ +static uint64_t +ecache_scrub_misc_err(int type, uint64_t afsr) +{ + spitf_async_flt spf_flt; + struct async_flt *aflt; + uint64_t oafsr; + + bzero(&spf_flt, sizeof (spitf_async_flt)); + aflt = &spf_flt.cmn_asyncflt; + + /* + * Scan each line in the cache to look for the one + * with bad parity + */ + aflt->flt_addr = AFLT_INV_ADDR; + scan_ecache(&aflt->flt_addr, &spf_flt.flt_ec_data[0], + &spf_flt.flt_ec_tag, &spf_flt.flt_ec_lcnt, &oafsr); + + if (oafsr & P_AFSR_CP) { + uint64_t *cp_afsr = CPU_PRIVATE_PTR(CPU, sfpr_scrub_afsr); + *cp_afsr |= oafsr; + } + + /* + * If we found a bad PA, update the state to indicate if it is + * memory or I/O space. + */ + if (aflt->flt_addr != AFLT_INV_ADDR) { + aflt->flt_in_memory = (pf_is_memory(aflt->flt_addr >> + MMU_PAGESHIFT)) ? 1 : 0; + } + + spf_flt.flt_type = (ushort_t)type; + + aflt->flt_inst = CPU->cpu_id; + aflt->flt_class = CPU_FAULT; + aflt->flt_id = gethrtime_waitfree(); + aflt->flt_status = afsr; + aflt->flt_panic = (uchar_t)ecache_scrub_panic; + + /* + * We have the bad line, flush that line and start + * the memscrubber. + */ + if (spf_flt.flt_ec_lcnt > 0) { + flushecacheline(P2ALIGN(aflt->flt_addr, 64), + cpunodes[CPU->cpu_id].ecache_size); + read_all_memscrub = 1; + memscrub_run(); + } + + cpu_errorq_dispatch((type == CPU_ORPHAN_CP_ERR) ? + FM_EREPORT_CPU_USII_CP : FM_EREPORT_CPU_USII_UNKNOWN, + (void *)&spf_flt, sizeof (spf_flt), ue_queue, aflt->flt_panic); + + return (oafsr); +} + +static void +ecache_scrub_tag_err(uint64_t afsr, uchar_t state, uint32_t index) +{ + ushort_t afsr_ets = (afsr & P_AFSR_ETS) >> P_AFSR_ETS_SHIFT; + spitfire_scrub_misc_t *ssmp = CPU_PRIVATE_PTR(CPU, sfpr_scrub_misc); + ecache_kstat_t *ec_ksp = (ecache_kstat_t *)ssmp->ecache_ksp->ks_data; + uint64_t ec_tag, paddr, oafsr; + ec_data_t ec_data[8]; + int cpuid = CPU->cpu_id; + uint32_t ec_set_size = cpunodes[cpuid].ecache_size / + ecache_associativity; + uint64_t *cpu_afsr = CPU_PRIVATE_PTR(CPU, sfpr_scrub_afsr); + + get_ecache_dtag(P2ALIGN(index, 64), (uint64_t *)&ec_data[0], &ec_tag, + &oafsr, cpu_afsr); + paddr = ((ec_tag & cpu_ec_tag_mask) << cpu_ec_tag_shift) | + (index % ec_set_size); + + /* + * E$ tag state has good parity + */ + if ((afsr_ets & cpu_ec_state_parity) == 0) { + if (afsr_ets & cpu_ec_parity) { + /* + * E$ tag state bits indicate the line is clean, + * invalidate the E$ tag and continue. + */ + if (!(state & cpu_ec_state_dirty)) { + /* + * Zero the tag and mark the state invalid + * with good parity for the tag. 
+ */ + if (isus2i || isus2e) + write_hb_ec_tag_parity(index); + else + write_ec_tag_parity(index); + + /* Sync with the dual tag */ + flushecacheline(0, + cpunodes[CPU->cpu_id].ecache_size); + ec_ksp->tags_cleared.value.ul++; + ecache_scrub_log(ec_data, ec_tag, paddr, + CPU_ECACHE_TAG_ERR, afsr); + return; + } else { + ecache_scrub_log(ec_data, ec_tag, paddr, + CPU_ECACHE_ADDR_PAR_ERR, afsr); + cmn_err(CE_PANIC, " E$ tag address has bad" + " parity"); + } + } else if ((afsr_ets & cpu_ec_parity) == 0) { + /* + * ETS is zero but ETP is set + */ + ecache_scrub_log(ec_data, ec_tag, paddr, + CPU_ECACHE_ETP_ETS_ERR, afsr); + cmn_err(CE_PANIC, "AFSR.ETP is set and" + " AFSR.ETS is zero"); + } + } else { + /* + * E$ tag state bit has a bad parity + */ + ecache_scrub_log(ec_data, ec_tag, paddr, + CPU_ECACHE_STATE_ERR, afsr); + cmn_err(CE_PANIC, "E$ tag state has bad parity"); + } +} + +static void +ecache_page_retire(void *arg) +{ + uint64_t paddr = (uint64_t)arg; + page_t *pp = page_numtopp_nolock((pfn_t)(paddr >> MMU_PAGESHIFT)); + + if (pp) { + page_settoxic(pp, PAGE_IS_FAULTY); + (void) page_retire(pp, PAGE_IS_TOXIC); + } +} + +void +sticksync_slave(void) +{} + +void +sticksync_master(void) +{} + +/*ARGSUSED*/ +void +cpu_check_ce(int flag, uint64_t pa, caddr_t va, uint_t bpp) +{} + +void +cpu_run_bus_error_handlers(struct async_flt *aflt, int expected) +{ + int status; + ddi_fm_error_t de; + + bzero(&de, sizeof (ddi_fm_error_t)); + + de.fme_ena = fm_ena_generate_cpu(aflt->flt_id, aflt->flt_inst, + FM_ENA_FMT1); + de.fme_flag = expected; + de.fme_bus_specific = (void *)aflt->flt_addr; + status = ndi_fm_handler_dispatch(ddi_root_node(), NULL, &de); + + if ((aflt->flt_prot == AFLT_PROT_NONE) && (status == DDI_FM_FATAL)) + aflt->flt_panic = 1; +} + +/*ARGSUSED*/ +void +cpu_errorq_dispatch(char *error_class, void *payload, size_t payload_sz, + errorq_t *eqp, uint_t flag) +{ + struct async_flt *aflt = (struct async_flt *)payload; + + aflt->flt_erpt_class = error_class; + errorq_dispatch(eqp, payload, payload_sz, flag); +} + +#define MAX_SIMM 8 + +struct ce_info { + char name[UNUM_NAMLEN]; + uint64_t intermittent_total; + uint64_t persistent_total; + uint64_t sticky_total; + unsigned short leaky_bucket_cnt; +}; + +/* + * Separately-defined structure for use in reporting the ce_info + * to SunVTS without exposing the internal layout and implementation + * of struct ce_info. + */ +static struct ecc_error_info ecc_error_info_data = { + { "version", KSTAT_DATA_UINT32 }, + { "maxcount", KSTAT_DATA_UINT32 }, + { "count", KSTAT_DATA_UINT32 } +}; +static const size_t ecc_error_info_ndata = sizeof (ecc_error_info_data) / + sizeof (struct kstat_named); + +#if KSTAT_CE_UNUM_NAMLEN < UNUM_NAMLEN +#error "Need to rev ecc_error_info version and update KSTAT_CE_UNUM_NAMLEN" +#endif + +struct ce_info *mem_ce_simm = NULL; +size_t mem_ce_simm_size = 0; + +/* + * Default values for the number of CE's allowed per interval. 
+ * Interval is defined in minutes + * SOFTERR_MIN_TIMEOUT is defined in microseconds + */ +#define SOFTERR_LIMIT_DEFAULT 2 +#define SOFTERR_INTERVAL_DEFAULT 1440 /* This is 24 hours */ +#define SOFTERR_MIN_TIMEOUT (60 * MICROSEC) /* This is 1 minute */ +#define TIMEOUT_NONE ((timeout_id_t)0) +#define TIMEOUT_SET ((timeout_id_t)1) + +/* + * timeout identifer for leaky_bucket + */ +static timeout_id_t leaky_bucket_timeout_id = TIMEOUT_NONE; + +/* + * Tunables for maximum number of allowed CE's in a given time + */ +int ecc_softerr_limit = SOFTERR_LIMIT_DEFAULT; +int ecc_softerr_interval = SOFTERR_INTERVAL_DEFAULT; + +void +cpu_mp_init(void) +{ + size_t size = cpu_aflt_size(); + size_t i; + kstat_t *ksp; + + /* + * Initialize the CE error handling buffers. + */ + mem_ce_simm_size = MAX_SIMM * max_ncpus; + size = sizeof (struct ce_info) * mem_ce_simm_size; + mem_ce_simm = kmem_zalloc(size, KM_SLEEP); + + ksp = kstat_create("unix", 0, "ecc-info", "misc", + KSTAT_TYPE_NAMED, ecc_error_info_ndata, KSTAT_FLAG_VIRTUAL); + if (ksp != NULL) { + ksp->ks_data = (struct kstat_named *)&ecc_error_info_data; + ecc_error_info_data.version.value.ui32 = KSTAT_CE_INFO_VER; + ecc_error_info_data.maxcount.value.ui32 = mem_ce_simm_size; + ecc_error_info_data.count.value.ui32 = 0; + kstat_install(ksp); + } + + for (i = 0; i < mem_ce_simm_size; i++) { + struct kstat_ecc_mm_info *kceip; + + kceip = kmem_zalloc(sizeof (struct kstat_ecc_mm_info), + KM_SLEEP); + ksp = kstat_create("mm", i, "ecc-info", "misc", + KSTAT_TYPE_NAMED, + sizeof (struct kstat_ecc_mm_info) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (ksp != NULL) { + /* + * Re-declare ks_data_size to include room for the + * UNUM name since we don't have KSTAT_FLAG_VAR_SIZE + * set. + */ + ksp->ks_data_size = sizeof (struct kstat_ecc_mm_info) + + KSTAT_CE_UNUM_NAMLEN; + ksp->ks_data = kceip; + kstat_named_init(&kceip->name, + "name", KSTAT_DATA_STRING); + kstat_named_init(&kceip->intermittent_total, + "intermittent_total", KSTAT_DATA_UINT64); + kstat_named_init(&kceip->persistent_total, + "persistent_total", KSTAT_DATA_UINT64); + kstat_named_init(&kceip->sticky_total, + "sticky_total", KSTAT_DATA_UINT64); + /* + * Use the default snapshot routine as it knows how to + * deal with named kstats with long strings. + */ + ksp->ks_update = ecc_kstat_update; + kstat_install(ksp); + } else { + kmem_free(kceip, sizeof (struct kstat_ecc_mm_info)); + } + } +} + +/*ARGSUSED*/ +static void +leaky_bucket_timeout(void *arg) +{ + int i; + struct ce_info *psimm = mem_ce_simm; + + for (i = 0; i < mem_ce_simm_size; i++) { + if (psimm[i].leaky_bucket_cnt > 0) + atomic_add_16(&psimm[i].leaky_bucket_cnt, -1); + } + add_leaky_bucket_timeout(); +} + +static void +add_leaky_bucket_timeout(void) +{ + long timeout_in_microsecs; + + /* + * create timeout for next leak. + * + * The timeout interval is calculated as follows + * + * (ecc_softerr_interval * 60 * MICROSEC) / ecc_softerr_limit + * + * ecc_softerr_interval is in minutes, so multiply this by 60 (seconds + * in a minute), then multiply this by MICROSEC to get the interval + * in microseconds. Divide this total by ecc_softerr_limit so that + * the timeout interval is accurate to within a few microseconds. 
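+	 *
+	 * For example, with the defaults (ecc_softerr_limit = 2,
+	 * ecc_softerr_interval = 1440 minutes) this works out to
+	 *
+	 *	(1440 * 60 * MICROSEC) / 2 = 43,200,000,000 usec
+	 *
+	 * i.e. one count leaks out of each per-unum bucket every 12 hours,
+	 * so roughly more than 2 persistent CEs accumulated against one
+	 * memory module inside a 24 hour window will trip the threshold.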
+ */ + + if (ecc_softerr_limit <= 0) + ecc_softerr_limit = SOFTERR_LIMIT_DEFAULT; + if (ecc_softerr_interval <= 0) + ecc_softerr_interval = SOFTERR_INTERVAL_DEFAULT; + + timeout_in_microsecs = ((int64_t)ecc_softerr_interval * 60 * MICROSEC) / + ecc_softerr_limit; + + if (timeout_in_microsecs < SOFTERR_MIN_TIMEOUT) + timeout_in_microsecs = SOFTERR_MIN_TIMEOUT; + + leaky_bucket_timeout_id = timeout(leaky_bucket_timeout, + (void *)NULL, drv_usectohz((clock_t)timeout_in_microsecs)); +} + +/* + * Legacy Correctable ECC Error Hash + * + * All of the code below this comment is used to implement a legacy array + * which counted intermittent, persistent, and sticky CE errors by unum, + * and then was later extended to publish the data as a kstat for SunVTS. + * All of this code is replaced by FMA, and remains here until such time + * that the UltraSPARC-I/II CPU code is converted to FMA, or is EOLed. + * + * Errors are saved in three buckets per-unum: + * (1) sticky - scrub was unsuccessful, cannot be scrubbed + * This could represent a problem, and is immediately printed out. + * (2) persistent - was successfully scrubbed + * These errors use the leaky bucket algorithm to determine + * if there is a serious problem. + * (3) intermittent - may have originated from the cpu or upa/safari bus, + * and does not necessarily indicate any problem with the dimm itself, + * is critical information for debugging new hardware. + * Because we do not know if it came from the dimm, it would be + * inappropriate to include these in the leaky bucket counts. + * + * If the E$ line was modified before the scrub operation began, then the + * displacement flush at the beginning of scrubphys() will cause the modified + * line to be written out, which will clean up the CE. Then, any subsequent + * read will not cause an error, which will cause persistent errors to be + * identified as intermittent. + * + * If a DIMM is going bad, it will produce true persistents as well as + * false intermittents, so these intermittents can be safely ignored. + * + * If the error count is excessive for a DIMM, this function will return + * PAGE_IS_FAILING, and the CPU module may then decide to remove that page + * from use. + */ +static int +ce_count_unum(int status, int len, char *unum) +{ + int i; + struct ce_info *psimm = mem_ce_simm; + int page_status = PAGE_IS_OK; + + ASSERT(psimm != NULL); + + if (len <= 0 || + (status & (ECC_STICKY | ECC_PERSISTENT | ECC_INTERMITTENT)) == 0) + return (page_status); + + /* + * Initialize the leaky_bucket timeout + */ + if (casptr(&leaky_bucket_timeout_id, + TIMEOUT_NONE, TIMEOUT_SET) == TIMEOUT_NONE) + add_leaky_bucket_timeout(); + + for (i = 0; i < mem_ce_simm_size; i++) { + if (psimm[i].name[0] == '\0') { + /* + * Hit the end of the valid entries, add + * a new one. + */ + (void) strncpy(psimm[i].name, unum, len); + if (status & ECC_STICKY) { + /* + * Sticky - the leaky bucket is used to track + * soft errors. Since a sticky error is a + * hard error and likely to be retired soon, + * we do not count it in the leaky bucket. 
+ */ + psimm[i].leaky_bucket_cnt = 0; + psimm[i].intermittent_total = 0; + psimm[i].persistent_total = 0; + psimm[i].sticky_total = 1; + cmn_err(CE_WARN, + "[AFT0] Sticky Softerror encountered " + "on Memory Module %s\n", unum); + page_status = PAGE_IS_FAILING; + } else if (status & ECC_PERSISTENT) { + psimm[i].leaky_bucket_cnt = 1; + psimm[i].intermittent_total = 0; + psimm[i].persistent_total = 1; + psimm[i].sticky_total = 0; + } else { + /* + * Intermittent - Because the scrub operation + * cannot find the error in the DIMM, we will + * not count these in the leaky bucket + */ + psimm[i].leaky_bucket_cnt = 0; + psimm[i].intermittent_total = 1; + psimm[i].persistent_total = 0; + psimm[i].sticky_total = 0; + } + ecc_error_info_data.count.value.ui32++; + break; + } else if (strncmp(unum, psimm[i].name, len) == 0) { + /* + * Found an existing entry for the current + * memory module, adjust the counts. + */ + if (status & ECC_STICKY) { + psimm[i].sticky_total++; + cmn_err(CE_WARN, + "[AFT0] Sticky Softerror encountered " + "on Memory Module %s\n", unum); + page_status = PAGE_IS_FAILING; + } else if (status & ECC_PERSISTENT) { + int new_value; + + new_value = atomic_add_16_nv( + &psimm[i].leaky_bucket_cnt, 1); + psimm[i].persistent_total++; + if (new_value > ecc_softerr_limit) { + cmn_err(CE_WARN, "[AFT0] Most recent %d" + " soft errors from Memory Module" + " %s exceed threshold (N=%d," + " T=%dh:%02dm) triggering page" + " retire", new_value, unum, + ecc_softerr_limit, + ecc_softerr_interval / 60, + ecc_softerr_interval % 60); + atomic_add_16( + &psimm[i].leaky_bucket_cnt, -1); + page_status = PAGE_IS_FAILING; + } + } else { /* Intermittent */ + psimm[i].intermittent_total++; + } + break; + } + } + + if (i >= mem_ce_simm_size) + cmn_err(CE_CONT, "[AFT0] Softerror: mem_ce_simm[] out of " + "space.\n"); + + return (page_status); +} + +/* + * Function to support counting of IO detected CEs. + */ +void +cpu_ce_count_unum(struct async_flt *ecc, int len, char *unum) +{ + if (ce_count_unum(ecc->flt_status, len, unum) == PAGE_IS_FAILING && + automatic_page_removal) { + page_t *pp = page_numtopp_nolock((pfn_t) + (ecc->flt_addr >> MMU_PAGESHIFT)); + + if (pp) { + page_settoxic(pp, PAGE_IS_FAULTY); + (void) page_retire(pp, PAGE_IS_FAILING); + } + } +} + +static int +ecc_kstat_update(kstat_t *ksp, int rw) +{ + struct kstat_ecc_mm_info *kceip = ksp->ks_data; + struct ce_info *ceip = mem_ce_simm; + int i = ksp->ks_instance; + + if (rw == KSTAT_WRITE) + return (EACCES); + + ASSERT(ksp->ks_data != NULL); + ASSERT(i < mem_ce_simm_size && i >= 0); + + /* + * Since we're not using locks, make sure that we don't get partial + * data. The name is always copied before the counters are incremented + * so only do this update routine if at least one of the counters is + * non-zero, which ensures that ce_count_unum() is done, and the + * string is fully copied. + */ + if (ceip[i].intermittent_total == 0 && + ceip[i].persistent_total == 0 && + ceip[i].sticky_total == 0) { + /* + * Uninitialized or partially initialized. Ignore. + * The ks_data buffer was allocated via kmem_zalloc, + * so no need to bzero it. 
+ */ + return (0); + } + + kstat_named_setstr(&kceip->name, ceip[i].name); + kceip->intermittent_total.value.ui64 = ceip[i].intermittent_total; + kceip->persistent_total.value.ui64 = ceip[i].persistent_total; + kceip->sticky_total.value.ui64 = ceip[i].sticky_total; + + return (0); +} + +#define VIS_BLOCKSIZE 64 + +int +dtrace_blksuword32_err(uintptr_t addr, uint32_t *data) +{ + int ret, watched; + + watched = watch_disable_addr((void *)addr, VIS_BLOCKSIZE, S_WRITE); + ret = dtrace_blksuword32(addr, data, 0); + if (watched) + watch_enable_addr((void *)addr, VIS_BLOCKSIZE, S_WRITE); + + return (ret); +} + +/*ARGSUSED*/ +void +cpu_faulted_enter(struct cpu *cp) +{ +} + +/*ARGSUSED*/ +void +cpu_faulted_exit(struct cpu *cp) +{ +} + +static int mmu_disable_ism_large_pages = ((1 << TTE512K) | + (1 << TTE32M) | (1 << TTE256M)); +static int mmu_disable_large_pages = ((1 << TTE32M) | (1 << TTE256M)); + +/* + * The function returns the US_II mmu-specific values for the + * hat's disable_large_pages and disable_ism_large_pages variables. + */ +int +mmu_large_pages_disabled(uint_t flag) +{ + int pages_disable = 0; + + if (flag == HAT_LOAD) { + pages_disable = mmu_disable_large_pages; + } else if (flag == HAT_LOAD_SHARE) { + pages_disable = mmu_disable_ism_large_pages; + } + return (pages_disable); +} + +/*ARGSUSED*/ +void +mmu_init_kernel_pgsz(struct hat *hat) +{ +} + +size_t +mmu_get_kernel_lpsize(size_t lpsize) +{ + uint_t tte; + + if (lpsize == 0) { + /* no setting for segkmem_lpsize in /etc/system: use default */ + return (MMU_PAGESIZE4M); + } + + for (tte = TTE8K; tte <= TTE4M; tte++) { + if (lpsize == TTEBYTES(tte)) + return (lpsize); + } + + return (TTEBYTES(TTE8K)); +} diff --git a/usr/src/uts/sun4u/cpu/spitfire_asm.s b/usr/src/uts/sun4u/cpu/spitfire_asm.s new file mode 100644 index 0000000000..9cdd0acd23 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/spitfire_asm.s @@ -0,0 +1,2017 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#if !defined(lint) +#include "assym.h" +#endif /* lint */ + +#include <sys/asm_linkage.h> +#include <sys/mmu.h> +#include <vm/hat_sfmmu.h> +#include <sys/machparam.h> +#include <sys/machcpuvar.h> +#include <sys/machthread.h> +#include <sys/privregs.h> +#include <sys/asm_linkage.h> +#include <sys/machasi.h> +#include <sys/trap.h> +#include <sys/spitregs.h> +#include <sys/xc_impl.h> +#include <sys/intreg.h> +#include <sys/async.h> + +#ifdef TRAPTRACE +#include <sys/traptrace.h> +#endif /* TRAPTRACE */ + +#ifndef lint + +/* BEGIN CSTYLED */ +#define DCACHE_FLUSHPAGE(arg1, arg2, tmp1, tmp2, tmp3) \ + ldxa [%g0]ASI_LSU, tmp1 ;\ + btst LSU_DC, tmp1 /* is dcache enabled? */ ;\ + bz,pn %icc, 1f ;\ + sethi %hi(dcache_linesize), tmp1 ;\ + ld [tmp1 + %lo(dcache_linesize)], tmp1 ;\ + sethi %hi(dflush_type), tmp2 ;\ + ld [tmp2 + %lo(dflush_type)], tmp2 ;\ + cmp tmp2, FLUSHPAGE_TYPE ;\ + be,pt %icc, 2f ;\ + sllx arg1, SF_DC_VBIT_SHIFT, arg1 /* tag to compare */ ;\ + sethi %hi(dcache_size), tmp3 ;\ + ld [tmp3 + %lo(dcache_size)], tmp3 ;\ + cmp tmp2, FLUSHMATCH_TYPE ;\ + be,pt %icc, 3f ;\ + nop ;\ + /* \ + * flushtype = FLUSHALL_TYPE, flush the whole thing \ + * tmp3 = cache size \ + * tmp1 = cache line size \ + */ \ + sub tmp3, tmp1, tmp2 ;\ +4: \ + stxa %g0, [tmp2]ASI_DC_TAG ;\ + membar #Sync ;\ + cmp %g0, tmp2 ;\ + bne,pt %icc, 4b ;\ + sub tmp2, tmp1, tmp2 ;\ + ba,pt %icc, 1f ;\ + nop ;\ + /* \ + * flushtype = FLUSHPAGE_TYPE \ + * arg1 = tag to compare against \ + * arg2 = virtual color \ + * tmp1 = cache line size \ + * tmp2 = tag from cache \ + * tmp3 = counter \ + */ \ +2: \ + set MMU_PAGESIZE, tmp3 ;\ + sllx arg2, MMU_PAGESHIFT, arg2 /* color to dcache page */ ;\ + sub tmp3, tmp1, tmp3 ;\ +4: \ + ldxa [arg2 + tmp3]ASI_DC_TAG, tmp2 /* read tag */ ;\ + btst SF_DC_VBIT_MASK, tmp2 ;\ + bz,pn %icc, 5f /* branch if no valid sub-blocks */ ;\ + andn tmp2, SF_DC_VBIT_MASK, tmp2 /* clear out v bits */ ;\ + cmp tmp2, arg1 ;\ + bne,pn %icc, 5f /* br if tag miss */ ;\ + nop ;\ + stxa %g0, [arg2 + tmp3]ASI_DC_TAG ;\ + membar #Sync ;\ +5: \ + cmp %g0, tmp3 ;\ + bnz,pt %icc, 4b /* branch if not done */ ;\ + sub tmp3, tmp1, tmp3 ;\ + ba,pt %icc, 1f ;\ + nop ;\ + /* \ + * flushtype = FLUSHMATCH_TYPE \ + * arg1 = tag to compare against \ + * tmp1 = cache line size \ + * tmp3 = cache size \ + * arg2 = counter \ + * tmp2 = cache tag \ + */ \ +3: \ + sub tmp3, tmp1, arg2 ;\ +4: \ + ldxa [arg2]ASI_DC_TAG, tmp2 /* read tag */ ;\ + btst SF_DC_VBIT_MASK, tmp2 ;\ + bz,pn %icc, 5f /* br if no valid sub-blocks */ ;\ + andn tmp2, SF_DC_VBIT_MASK, tmp2 /* clear out v bits */ ;\ + cmp tmp2, arg1 ;\ + bne,pn %icc, 5f /* branch if tag miss */ ;\ + nop ;\ + stxa %g0, [arg2]ASI_DC_TAG ;\ + membar #Sync ;\ +5: \ + cmp %g0, arg2 ;\ + bne,pt %icc, 4b /* branch if not done */ ;\ + sub arg2, tmp1, arg2 ;\ +1: + +/* + * macro that flushes the entire dcache color + */ +#define DCACHE_FLUSHCOLOR(arg, tmp1, tmp2) \ + ldxa [%g0]ASI_LSU, tmp1; \ + btst LSU_DC, tmp1; /* is dcache enabled? 
*/ \ + bz,pn %icc, 1f; \ + sethi %hi(dcache_linesize), tmp1; \ + ld [tmp1 + %lo(dcache_linesize)], tmp1; \ + set MMU_PAGESIZE, tmp2; \ + /* \ + * arg = virtual color \ + * tmp2 = page size \ + * tmp1 = cache line size \ + */ \ + sllx arg, MMU_PAGESHIFT, arg; /* color to dcache page */ \ + sub tmp2, tmp1, tmp2; \ +2: \ + stxa %g0, [arg + tmp2]ASI_DC_TAG; \ + membar #Sync; \ + cmp %g0, tmp2; \ + bne,pt %icc, 2b; \ + sub tmp2, tmp1, tmp2; \ +1: + +/* + * macro that flushes the entire dcache + */ +#define DCACHE_FLUSHALL(size, linesize, tmp) \ + ldxa [%g0]ASI_LSU, tmp; \ + btst LSU_DC, tmp; /* is dcache enabled? */ \ + bz,pn %icc, 1f; \ + \ + sub size, linesize, tmp; \ +2: \ + stxa %g0, [tmp]ASI_DC_TAG; \ + membar #Sync; \ + cmp %g0, tmp; \ + bne,pt %icc, 2b; \ + sub tmp, linesize, tmp; \ +1: + +/* + * macro that flushes the entire icache + */ +#define ICACHE_FLUSHALL(size, linesize, tmp) \ + ldxa [%g0]ASI_LSU, tmp; \ + btst LSU_IC, tmp; \ + bz,pn %icc, 1f; \ + \ + sub size, linesize, tmp; \ +2: \ + stxa %g0, [tmp]ASI_IC_TAG; \ + membar #Sync; \ + cmp %g0, tmp; \ + bne,pt %icc, 2b; \ + sub tmp, linesize, tmp; \ +1: + +/* + * Macro for getting to offset from 'cpu_private' ptr. The 'cpu_private' + * ptr is in the machcpu structure. + * r_or_s: Register or symbol off offset from 'cpu_private' ptr. + * scr1: Scratch, ptr is returned in this register. + * scr2: Scratch + */ +#define GET_CPU_PRIVATE_PTR(r_or_s, scr1, scr2, label) \ + CPU_ADDR(scr1, scr2); \ + ldn [scr1 + CPU_PRIVATE], scr1; \ + cmp scr1, 0; \ + be label; \ + nop; \ + add scr1, r_or_s, scr1; \ + +#ifdef HUMMINGBIRD +/* + * UltraSPARC-IIe processor supports both 4-way set associative and + * direct map E$. For performance reasons, we flush E$ by placing it + * in direct map mode for data load/store and restore the state after + * we are done flushing it. Keep interrupts off while flushing in this + * manner. + * + * We flush the entire ecache by starting at one end and loading each + * successive ecache line for the 2*ecache-size range. We have to repeat + * the flush operation to guarantee that the entire ecache has been + * flushed. + * + * For flushing a specific physical address, we start at the aliased + * address and load at set-size stride, wrapping around at 2*ecache-size + * boundary and skipping the physical address being flushed. It takes + * 10 loads to guarantee that the physical address has been flushed. + */ + +#define HB_ECACHE_FLUSH_CNT 2 +#define HB_PHYS_FLUSH_CNT 10 /* #loads to flush specific paddr */ +#endif /* HUMMINGBIRD */ + +/* END CSTYLED */ + +#endif /* !lint */ + +/* + * Spitfire MMU and Cache operations. 
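+ *
+ * As an aid to reading the DCACHE_FLUSHPAGE macro defined above, its
+ * FLUSHPAGE_TYPE case behaves roughly like the following C-level sketch.
+ * This is illustrative only; dc_tag_read() and dc_tag_invalidate() are
+ * hypothetical stand-ins for the ASI_DC_TAG diagnostic accesses:
+ *
+ *	tag = pfn << SF_DC_VBIT_SHIFT;
+ *	base = vcolor << MMU_PAGESHIFT;
+ *	for (off = MMU_PAGESIZE - dcache_linesize; off >= 0;
+ *	    off -= dcache_linesize) {
+ *		t = dc_tag_read(base + off);
+ *		if ((t & SF_DC_VBIT_MASK) != 0 &&
+ *		    (t & ~SF_DC_VBIT_MASK) == tag)
+ *			dc_tag_invalidate(base + off);
+ *	}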
+ */ + +#if defined(lint) + +/*ARGSUSED*/ +void +vtag_flushpage(caddr_t vaddr, uint_t ctxnum) +{} + +/*ARGSUSED*/ +void +vtag_flushctx(uint_t ctxnum) +{} + +/*ARGSUSED*/ +void +vtag_flushall(void) +{} + +/*ARGSUSED*/ +void +vtag_flushpage_tl1(uint64_t vaddr, uint64_t ctxnum) +{} + +/*ARGSUSED*/ +void +vtag_flush_pgcnt_tl1(uint64_t vaddr, uint64_t ctx_pgcnt) +{} + +/*ARGSUSED*/ +void +vtag_flushctx_tl1(uint64_t ctxnum, uint64_t dummy) +{} + +/*ARGSUSED*/ +void +vtag_flushall_tl1(uint64_t dummy1, uint64_t dummy2) +{} + +/*ARGSUSED*/ +void +vac_flushpage(pfn_t pfnum, int vcolor) +{} + +/*ARGSUSED*/ +void +vac_flushpage_tl1(uint64_t pfnum, uint64_t vcolor) +{} + +/*ARGSUSED*/ +void +init_mondo(xcfunc_t *func, uint64_t arg1, uint64_t arg2) +{} + +/*ARGSUSED*/ +void +init_mondo_nocheck(xcfunc_t *func, uint64_t arg1, uint64_t arg2) +{} + +/*ARGSUSED*/ +void +flush_instr_mem(caddr_t vaddr, size_t len) +{} + +/*ARGSUSED*/ +void +flush_ecache(uint64_t physaddr, size_t size, size_t linesize) +{} + +/*ARGSUSED*/ +void +get_ecache_dtag(uint32_t ecache_idx, uint64_t *ecache_data, + uint64_t *ecache_tag, uint64_t *oafsr, uint64_t *acc_afsr) +{} + +/* ARGSUSED */ +uint64_t +get_ecache_tag(uint32_t id, uint64_t *nafsr, uint64_t *acc_afsr) +{ + return ((uint64_t)0); +} + +/* ARGSUSED */ +uint64_t +check_ecache_line(uint32_t id, uint64_t *acc_afsr) +{ + return ((uint64_t)0); +} + +/*ARGSUSED*/ +void +kdi_flush_idcache(int dcache_size, int dcache_lsize, + int icache_size, int icache_lsize) +{} + +#else /* lint */ + + ENTRY_NP(vtag_flushpage) + /* + * flush page from the tlb + * + * %o0 = vaddr + * %o1 = ctxnum + */ + rdpr %pstate, %o5 +#ifdef DEBUG + andcc %o5, PSTATE_IE, %g0 /* if interrupts already */ + bnz,a,pt %icc, 3f /* disabled, panic */ + nop + save %sp, -SA(MINFRAME), %sp + sethi %hi(sfmmu_panic1), %o0 + call panic + or %o0, %lo(sfmmu_panic1), %o0 + ret + restore +3: +#endif /* DEBUG */ + /* + * disable ints + */ + andn %o5, PSTATE_IE, %o4 + wrpr %o4, 0, %pstate + + /* + * Then, blow out the tlb + * Interrupts are disabled to prevent the secondary ctx register + * from changing underneath us. + */ + brnz,pt %o1, 1f /* KCONTEXT? */ + sethi %hi(FLUSH_ADDR), %o3 + /* + * For KCONTEXT demaps use primary. type = page implicitly + */ + stxa %g0, [%o0]ASI_DTLB_DEMAP /* dmmu flush for KCONTEXT */ + stxa %g0, [%o0]ASI_ITLB_DEMAP /* immu flush for KCONTEXT */ + b 5f + flush %o3 +1: + /* + * User demap. We need to set the secondary context properly. + * %o0 = vaddr + * %o1 = ctxnum + * %o3 = FLUSH_ADDR + */ + set MMU_SCONTEXT, %o4 + ldxa [%o4]ASI_DMMU, %o2 /* rd old ctxnum */ + or DEMAP_SECOND | DEMAP_PAGE_TYPE, %o0, %o0 + cmp %o2, %o1 + be,a,pt %icc, 4f + nop + stxa %o1, [%o4]ASI_DMMU /* wr new ctxum */ +4: + stxa %g0, [%o0]ASI_DTLB_DEMAP + stxa %g0, [%o0]ASI_ITLB_DEMAP + flush %o3 + be,a,pt %icc, 5f + nop + stxa %o2, [%o4]ASI_DMMU /* restore old ctxnum */ + flush %o3 +5: + retl + wrpr %g0, %o5, %pstate /* enable interrupts */ + SET_SIZE(vtag_flushpage) + + ENTRY_NP(vtag_flushctx) + /* + * flush context from the tlb + * + * %o0 = ctxnum + * We disable interrupts to prevent the secondary ctx register changing + * underneath us. 
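+ *
+ * In outline (illustrative pseudo-code only, not the literal
+ * instruction sequence):
+ *
+ *	pstate &= ~PSTATE_IE;
+ *	old = secondary context;
+ *	if (old != ctxnum)
+ *		secondary context = ctxnum;
+ *	store to ASI_DTLB_DEMAP/ASI_ITLB_DEMAP with
+ *	    DEMAP_CTX_TYPE | DEMAP_SECOND;
+ *	if (old != ctxnum)
+ *		secondary context = old;
+ *	restore pstate;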
+ */ + sethi %hi(FLUSH_ADDR), %o3 + set DEMAP_CTX_TYPE | DEMAP_SECOND, %g1 + rdpr %pstate, %o2 + +#ifdef DEBUG + andcc %o2, PSTATE_IE, %g0 /* if interrupts already */ + bnz,a,pt %icc, 1f /* disabled, panic */ + nop + sethi %hi(sfmmu_panic1), %o0 + call panic + or %o0, %lo(sfmmu_panic1), %o0 +1: +#endif /* DEBUG */ + + wrpr %o2, PSTATE_IE, %pstate /* disable interrupts */ + set MMU_SCONTEXT, %o4 + ldxa [%o4]ASI_DMMU, %o5 /* rd old ctxnum */ + cmp %o5, %o0 + be,a,pt %icc, 4f + nop + stxa %o0, [%o4]ASI_DMMU /* wr new ctxum */ +4: + stxa %g0, [%g1]ASI_DTLB_DEMAP + stxa %g0, [%g1]ASI_ITLB_DEMAP + flush %o3 + be,a,pt %icc, 5f + nop + stxa %o5, [%o4]ASI_DMMU /* restore old ctxnum */ + flush %o3 +5: + retl + wrpr %g0, %o2, %pstate /* enable interrupts */ + SET_SIZE(vtag_flushctx) + + .seg ".text" +.flushallmsg: + .asciz "sfmmu_asm: unimplemented flush operation" + + ENTRY_NP(vtag_flushall) + sethi %hi(.flushallmsg), %o0 + call panic + or %o0, %lo(.flushallmsg), %o0 + SET_SIZE(vtag_flushall) + + ENTRY_NP(vtag_flushpage_tl1) + /* + * x-trap to flush page from tlb and tsb + * + * %g1 = vaddr, zero-extended on 32-bit kernel + * %g2 = ctxnum + * + * assumes TSBE_TAG = 0 + */ + srln %g1, MMU_PAGESHIFT, %g1 + slln %g1, MMU_PAGESHIFT, %g1 /* g1 = vaddr */ + /* We need to set the secondary context properly. */ + set MMU_SCONTEXT, %g4 + ldxa [%g4]ASI_DMMU, %g5 /* rd old ctxnum */ + or DEMAP_SECOND | DEMAP_PAGE_TYPE, %g1, %g1 + stxa %g2, [%g4]ASI_DMMU /* wr new ctxum */ + stxa %g0, [%g1]ASI_DTLB_DEMAP + stxa %g0, [%g1]ASI_ITLB_DEMAP + stxa %g5, [%g4]ASI_DMMU /* restore old ctxnum */ + membar #Sync + retry + SET_SIZE(vtag_flushpage_tl1) + + ENTRY_NP(vtag_flush_pgcnt_tl1) + /* + * x-trap to flush pgcnt MMU_PAGESIZE pages from tlb + * + * %g1 = vaddr, zero-extended on 32-bit kernel + * %g2 = <zero32|ctx16|pgcnt16> + * + * NOTE: this handler relies on the fact that no + * interrupts or traps can occur during the loop + * issuing the TLB_DEMAP operations. It is assumed + * that interrupts are disabled and this code is + * fetching from the kernel locked text address. + * + * assumes TSBE_TAG = 0 + */ + srln %g1, MMU_PAGESHIFT, %g1 + slln %g1, MMU_PAGESHIFT, %g1 /* g1 = vaddr */ + or DEMAP_SECOND | DEMAP_PAGE_TYPE, %g1, %g1 + set 0xffff, %g4 + and %g4, %g2, %g3 /* g3 = pgcnt */ + srln %g2, 16, %g2 /* g2 = ctxnum */ + /* We need to set the secondary context properly. */ + set MMU_SCONTEXT, %g4 + ldxa [%g4]ASI_DMMU, %g5 /* read old ctxnum */ + stxa %g2, [%g4]ASI_DMMU /* write new ctxum */ + + set MMU_PAGESIZE, %g2 /* g2 = pgsize */ +1: + stxa %g0, [%g1]ASI_DTLB_DEMAP + stxa %g0, [%g1]ASI_ITLB_DEMAP + deccc %g3 /* decr pgcnt */ + bnz,pt %icc,1b + add %g1, %g2, %g1 /* go to nextpage */ + + stxa %g5, [%g4]ASI_DMMU /* restore old ctxnum */ + membar #Sync + retry + SET_SIZE(vtag_flush_pgcnt_tl1) + + ENTRY_NP(vtag_flushctx_tl1) + /* + * x-trap to flush context from tlb + * + * %g1 = ctxnum + */ + set DEMAP_CTX_TYPE | DEMAP_SECOND, %g4 + set MMU_SCONTEXT, %g3 + ldxa [%g3]ASI_DMMU, %g5 /* rd old ctxnum */ + stxa %g1, [%g3]ASI_DMMU /* wr new ctxum */ + stxa %g0, [%g4]ASI_DTLB_DEMAP + stxa %g0, [%g4]ASI_ITLB_DEMAP + stxa %g5, [%g3]ASI_DMMU /* restore old ctxnum */ + membar #Sync + retry + SET_SIZE(vtag_flushctx_tl1) + + ! Not implemented on US1/US2 + ENTRY_NP(vtag_flushall_tl1) + retry + SET_SIZE(vtag_flushall_tl1) + +/* + * vac_flushpage(pfnum, color) + * Flush 1 8k page of the D-$ with physical page = pfnum + * Algorithm: + * The spitfire dcache is a 16k direct mapped virtual indexed, + * physically tagged cache. 
Given the pfnum we read all cache + * lines for the corresponding page in the cache (determined by + * the color). Each cache line is compared with + * the tag created from the pfnum. If the tags match we flush + * the line. + */ + .seg ".data" + .align 8 + .global dflush_type +dflush_type: + .word FLUSHPAGE_TYPE + .seg ".text" + + ENTRY(vac_flushpage) + /* + * flush page from the d$ + * + * %o0 = pfnum, %o1 = color + */ + DCACHE_FLUSHPAGE(%o0, %o1, %o2, %o3, %o4) + retl + nop + SET_SIZE(vac_flushpage) + + ENTRY_NP(vac_flushpage_tl1) + /* + * x-trap to flush page from the d$ + * + * %g1 = pfnum, %g2 = color + */ + DCACHE_FLUSHPAGE(%g1, %g2, %g3, %g4, %g5) + retry + SET_SIZE(vac_flushpage_tl1) + + ENTRY(vac_flushcolor) + /* + * %o0 = vcolor + */ + DCACHE_FLUSHCOLOR(%o0, %o1, %o2) + retl + nop + SET_SIZE(vac_flushcolor) + + ENTRY(vac_flushcolor_tl1) + /* + * %g1 = vcolor + */ + DCACHE_FLUSHCOLOR(%g1, %g2, %g3) + retry + SET_SIZE(vac_flushcolor_tl1) + + + .global _dispatch_status_busy +_dispatch_status_busy: + .asciz "ASI_INTR_DISPATCH_STATUS error: busy" + .align 4 + +/* + * Determine whether or not the IDSR is busy. + * Entry: no arguments + * Returns: 1 if busy, 0 otherwise + */ + ENTRY(idsr_busy) + ldxa [%g0]ASI_INTR_DISPATCH_STATUS, %g1 + clr %o0 + btst IDSR_BUSY, %g1 + bz,a,pt %xcc, 1f + mov 1, %o0 +1: + retl + nop + SET_SIZE(idsr_busy) + +/* + * Setup interrupt dispatch data registers + * Entry: + * %o0 - function or inumber to call + * %o1, %o2 - arguments (2 uint64_t's) + */ + .seg "text" + + ENTRY(init_mondo) +#ifdef DEBUG + ! + ! IDSR should not be busy at the moment + ! + ldxa [%g0]ASI_INTR_DISPATCH_STATUS, %g1 + btst IDSR_BUSY, %g1 + bz,pt %xcc, 1f + nop + + sethi %hi(_dispatch_status_busy), %o0 + call panic + or %o0, %lo(_dispatch_status_busy), %o0 +#endif /* DEBUG */ + + ALTENTRY(init_mondo_nocheck) + ! + ! interrupt vector dispach data reg 0 + ! +1: + mov IDDR_0, %g1 + mov IDDR_1, %g2 + mov IDDR_2, %g3 + stxa %o0, [%g1]ASI_INTR_DISPATCH + + ! + ! interrupt vector dispach data reg 1 + ! + stxa %o1, [%g2]ASI_INTR_DISPATCH + + ! + ! interrupt vector dispach data reg 2 + ! + stxa %o2, [%g3]ASI_INTR_DISPATCH + + retl + membar #Sync ! allowed to be in the delay slot + SET_SIZE(init_mondo) + +/* + * Ship mondo to upaid + */ + ENTRY_NP(shipit) + sll %o0, IDCR_PID_SHIFT, %g1 ! IDCR<18:14> = upa id + or %g1, IDCR_OFFSET, %g1 ! IDCR<13:0> = 0x70 + stxa %g0, [%g1]ASI_INTR_DISPATCH ! interrupt vector dispatch +#if defined(SF_ERRATA_54) + membar #Sync ! store must occur before load + mov 0x20, %g3 ! UDBH Control Register Read + ldxa [%g3]ASI_SDB_INTR_R, %g0 +#endif + retl + membar #Sync + SET_SIZE(shipit) + + +/* + * flush_instr_mem: + * Flush a portion of the I-$ starting at vaddr + * %o0 vaddr + * %o1 bytes to be flushed + */ + + ENTRY(flush_instr_mem) + membar #StoreStore ! Ensure the stores + ! are globally visible +1: + flush %o0 + subcc %o1, ICACHE_FLUSHSZ, %o1 ! bytes = bytes-0x20 + bgu,pt %ncc, 1b + add %o0, ICACHE_FLUSHSZ, %o0 ! vaddr = vaddr+0x20 + + retl + nop + SET_SIZE(flush_instr_mem) + +/* + * flush_ecache: + * Flush the entire e$ using displacement flush by reading through a + * physically contiguous area. We use mmu bypass asi (ASI_MEM) while + * reading this physical address range so that data doesn't go to d$. + * incoming arguments: + * %o0 - 64 bit physical address + * %o1 - size of address range to read + * %o2 - ecache linesize + */ + ENTRY(flush_ecache) +#ifndef HUMMINGBIRD + b 2f + nop +1: + ldxa [%o0 + %o1]ASI_MEM, %g0 ! 
start reading from physaddr + size +2: + subcc %o1, %o2, %o1 + bcc,a,pt %ncc, 1b + nop + +#else /* HUMMINGBIRD */ + /* + * UltraSPARC-IIe processor supports both 4-way set associative + * and direct map E$. For performance reasons, we flush E$ by + * placing it in direct map mode for data load/store and restore + * the state after we are done flushing it. It takes 2 iterations + * to guarantee that the entire ecache has been flushed. + * + * Keep the interrupts disabled while flushing E$ in this manner. + */ + rdpr %pstate, %g4 ! current pstate (restored later) + andn %g4, PSTATE_IE, %g5 + wrpr %g0, %g5, %pstate ! disable interrupts + + ! Place E$ in direct map mode for data access + or %g0, 1, %g5 + sllx %g5, HB_UPA_DMAP_DATA_BIT, %g5 + ldxa [%g0]ASI_UPA_CONFIG, %g1 ! current UPA config (restored later) + or %g1, %g5, %g5 + membar #Sync + stxa %g5, [%g0]ASI_UPA_CONFIG ! enable direct map for data access + membar #Sync + + ! flush entire ecache HB_ECACHE_FLUSH_CNT times + mov HB_ECACHE_FLUSH_CNT-1, %g5 +2: + sub %o1, %o2, %g3 ! start from last entry +1: + ldxa [%o0 + %g3]ASI_MEM, %g0 ! start reading from physaddr + size + subcc %g3, %o2, %g3 + bgeu,a,pt %ncc, 1b + nop + brgz,a,pt %g5, 2b + dec %g5 + + membar #Sync + stxa %g1, [%g0]ASI_UPA_CONFIG ! restore UPA config reg + membar #Sync + wrpr %g0, %g4, %pstate ! restore earlier pstate +#endif /* HUMMINGBIRD */ + + retl + nop + SET_SIZE(flush_ecache) + +/* + * void kdi_flush_idcache(int dcache_size, int dcache_linesize, + * int icache_size, int icache_linesize) + */ + ENTRY(kdi_flush_idcache) + DCACHE_FLUSHALL(%o0, %o1, %g1) + ICACHE_FLUSHALL(%o2, %o3, %g1) + membar #Sync + retl + nop + SET_SIZE(kdi_flush_idcache) + + +/* + * void get_ecache_dtag(uint32_t ecache_idx, uint64_t *data, uint64_t *tag, + * uint64_t *oafsr, uint64_t *acc_afsr) + * + * Get ecache data and tag. The ecache_idx argument is assumed to be aligned + * on a 64-byte boundary. The corresponding AFSR value is also read for each + * 8 byte ecache data obtained. The ecache data is assumed to be a pointer + * to an array of 16 uint64_t's (e$data & afsr value). The action to read the + * data and tag should be atomic to make sense. We will be executing at PIL15 + * and will disable IE, so nothing can occur between the two reads. We also + * assume that the execution of this code does not interfere with what we are + * reading - not really possible, but we'll live with it for now. + * We also pass the old AFSR value before clearing it, and caller will take + * appropriate actions if the important bits are non-zero. + * + * If the caller wishes to track the AFSR in cases where the CP bit is + * set, an address should be passed in for acc_afsr. Otherwise, this + * argument may be null. + * + * Register Usage: + * i0: In: 32-bit e$ index + * i1: In: addr of e$ data + * i2: In: addr of e$ tag + * i3: In: addr of old afsr + * i4: In: addr of accumulated afsr - may be null + */ + ENTRY(get_ecache_dtag) + save %sp, -SA(MINFRAME), %sp + or %g0, 1, %l4 + sllx %l4, 39, %l4 ! set bit 39 for e$ data access + or %i0, %l4, %g6 ! %g6 = e$ addr for data read + sllx %l4, 1, %l4 ! set bit 40 for e$ tag access + or %i0, %l4, %l4 ! %l4 = e$ addr for tag read + + rdpr %pstate, %i5 + andn %i5, PSTATE_IE | PSTATE_AM, %i0 + wrpr %i0, %g0, %pstate ! clear IE, AM bits + + ldxa [%g0]ASI_ESTATE_ERR, %g1 + stxa %g0, [%g0]ASI_ESTATE_ERR ! disable errors + membar #Sync + + ldxa [%g0]ASI_AFSR, %i0 ! grab the old-afsr before tag read + stx %i0, [%i3] ! write back the old-afsr + + ldxa [%l4]ASI_EC_R, %g0 ! 
read tag into E$ tag reg + ldxa [%g0]ASI_EC_DIAG, %i0 ! read tag from E$ tag reg + stx %i0, [%i2] ! write back tag result + + clr %i2 ! loop count + + brz %i4, 1f ! acc_afsr == NULL? + ldxa [%g0]ASI_AFSR, %i0 ! grab the old-afsr before clearing + srlx %i0, P_AFSR_CP_SHIFT, %l0 + btst 1, %l0 + bz 1f + nop + ldx [%i4], %g4 + or %g4, %i0, %g4 ! aggregate AFSR in cpu private + stx %g4, [%i4] +1: + stxa %i0, [%g0]ASI_AFSR ! clear AFSR + membar #Sync + ldxa [%g6]ASI_EC_R, %i0 ! read the 8byte E$data + stx %i0, [%i1] ! save the E$data + add %g6, 8, %g6 + add %i1, 8, %i1 + ldxa [%g0]ASI_AFSR, %i0 ! read AFSR for this 16byte read + srlx %i0, P_AFSR_CP_SHIFT, %l0 + btst 1, %l0 + bz 2f + stx %i0, [%i1] ! save the AFSR + + brz %i4, 2f ! acc_afsr == NULL? + nop + ldx [%i4], %g4 + or %g4, %i0, %g4 ! aggregate AFSR in cpu private + stx %g4, [%i4] +2: + add %i2, 8, %i2 + cmp %i2, 64 + bl,a 1b + add %i1, 8, %i1 + stxa %i0, [%g0]ASI_AFSR ! clear AFSR + membar #Sync + stxa %g1, [%g0]ASI_ESTATE_ERR ! restore error enable + membar #Sync + wrpr %g0, %i5, %pstate + ret + restore + SET_SIZE(get_ecache_dtag) +#endif /* lint */ + +#if defined(lint) +/* + * The ce_err function handles trap type 0x63 (corrected_ECC_error) at tl=0. + * Steps: 1. GET AFSR 2. Get AFAR <40:4> 3. Get datapath error status + * 4. Clear datapath error bit(s) 5. Clear AFSR error bit + * 6. package data in %g2 and %g3 7. call cpu_ce_error vis sys_trap + * %g2: [ 52:43 UDB lower | 42:33 UDB upper | 32:0 afsr ] - arg #3/arg #1 + * %g3: [ 40:4 afar ] - sys_trap->have_win: arg #4/arg #2 + */ +void +ce_err(void) +{} + +void +ce_err_tl1(void) +{} + + +/* + * The async_err function handles trap types 0x0A (instruction_access_error) + * and 0x32 (data_access_error) at TL = 0 and TL > 0. When we branch here, + * %g5 will have the trap type (with 0x200 set if we're at TL > 0). + * + * Steps: 1. Get AFSR 2. Get AFAR <40:4> 3. If not UE error skip UDP registers. + * 4. Else get and clear datapath error bit(s) 4. Clear AFSR error bits + * 6. package data in %g2 and %g3 7. disable all cpu errors, because + * trap is likely to be fatal 8. call cpu_async_error vis sys_trap + * + * %g3: [ 63:53 tt | 52:43 UDB_L | 42:33 UDB_U | 32:0 afsr ] - arg #3/arg #1 + * %g2: [ 40:4 afar ] - sys_trap->have_win: arg #4/arg #2 + */ +void +async_err(void) +{} + +/* + * The clr_datapath function clears any error bits set in the UDB regs. + */ +void +clr_datapath(void) +{} + +/* + * The get_udb_errors() function gets the current value of the + * Datapath Error Registers. + */ +/*ARGSUSED*/ +void +get_udb_errors(uint64_t *udbh, uint64_t *udbl) +{ + *udbh = 0; + *udbl = 0; +} + +#else /* lint */ + + ENTRY_NP(ce_err) + ldxa [%g0]ASI_AFSR, %g3 ! save afsr in g3 + + ! + ! Check for a UE... From Kevin.Normoyle: + ! We try to switch to the trap for the UE, but since that's + ! a hardware pipeline, we might get to the CE trap before we + ! can switch. The UDB and AFSR registers will have both the + ! UE and CE bits set but the UDB syndrome and the AFAR will be + ! for the UE. + ! + or %g0, 1, %g1 ! put 1 in g1 + sllx %g1, 21, %g1 ! shift left to <21> afsr UE + andcc %g1, %g3, %g0 ! check for UE in afsr + bnz async_err ! handle the UE, not the CE + or %g0, 0x63, %g5 ! pass along the CE ttype + ! + ! Disable further CE traps to avoid recursion (stack overflow) + ! and staying above XCALL_PIL for extended periods. + ! + ldxa [%g0]ASI_ESTATE_ERR, %g2 + andn %g2, 0x1, %g2 ! clear bit 0 - CEEN + stxa %g2, [%g0]ASI_ESTATE_ERR + membar #Sync ! required + ! + ! 
handle the CE + ldxa [%g0]ASI_AFAR, %g2 ! save afar in g2 + + set P_DER_H, %g4 ! put P_DER_H in g4 + ldxa [%g4]ASI_SDB_INTR_R, %g5 ! read sdb upper half into g5 + or %g0, 1, %g6 ! put 1 in g6 + sllx %g6, 8, %g6 ! shift g6 to <8> sdb CE + andcc %g5, %g6, %g1 ! check for CE in upper half + sllx %g5, 33, %g5 ! shift upper bits to <42:33> + or %g3, %g5, %g3 ! or with afsr bits + bz,a 1f ! no error, goto 1f + nop + stxa %g1, [%g4]ASI_SDB_INTR_W ! clear sdb reg error bit + membar #Sync ! membar sync required +1: + set P_DER_L, %g4 ! put P_DER_L in g4 + ldxa [%g4]ASI_SDB_INTR_R, %g5 ! read sdb lower half into g6 + andcc %g5, %g6, %g1 ! check for CE in lower half + sllx %g5, 43, %g5 ! shift upper bits to <52:43> + or %g3, %g5, %g3 ! or with afsr bits + bz,a 2f ! no error, goto 2f + nop + stxa %g1, [%g4]ASI_SDB_INTR_W ! clear sdb reg error bit + membar #Sync ! membar sync required +2: + or %g0, 1, %g4 ! put 1 in g4 + sllx %g4, 20, %g4 ! shift left to <20> afsr CE + stxa %g4, [%g0]ASI_AFSR ! use g4 to clear afsr CE error + membar #Sync ! membar sync required + + set cpu_ce_error, %g1 ! put *cpu_ce_error() in g1 + rdpr %pil, %g6 ! read pil into %g6 + subcc %g6, PIL_15, %g0 + movneg %icc, PIL_14, %g4 ! run at pil 14 unless already at 15 + sethi %hi(sys_trap), %g5 + jmp %g5 + %lo(sys_trap) ! goto sys_trap + movge %icc, PIL_15, %g4 ! already at pil 15 + SET_SIZE(ce_err) + + ENTRY_NP(ce_err_tl1) +#ifndef TRAPTRACE + ldxa [%g0]ASI_AFSR, %g7 + stxa %g7, [%g0]ASI_AFSR + membar #Sync + retry +#else + set ce_trap_tl1, %g1 + sethi %hi(dis_err_panic1), %g4 + jmp %g4 + %lo(dis_err_panic1) + nop +#endif + SET_SIZE(ce_err_tl1) + +#ifdef TRAPTRACE +.celevel1msg: + .asciz "Softerror with trap tracing at tl1: AFAR 0x%08x.%08x AFSR 0x%08x.%08x"; + + ENTRY_NP(ce_trap_tl1) + ! upper 32 bits of AFSR already in o3 + mov %o4, %o0 ! save AFAR upper 32 bits + mov %o2, %o4 ! lower 32 bits of AFSR + mov %o1, %o2 ! lower 32 bits of AFAR + mov %o0, %o1 ! upper 32 bits of AFAR + set .celevel1msg, %o0 + call panic + nop + SET_SIZE(ce_trap_tl1) +#endif + + ! + ! async_err is the assembly glue code to get us from the actual trap + ! into the CPU module's C error handler. Note that we also branch + ! here from ce_err() above. + ! + ENTRY_NP(async_err) + stxa %g0, [%g0]ASI_ESTATE_ERR ! disable ecc and other cpu errors + membar #Sync ! membar sync required + + ldxa [%g0]ASI_AFSR, %g3 ! save afsr in g3 + ldxa [%g0]ASI_AFAR, %g2 ! save afar in g2 + + sllx %g5, 53, %g5 ! move ttype to <63:53> + or %g3, %g5, %g3 ! or to afsr in g3 + + or %g0, 1, %g1 ! put 1 in g1 + sllx %g1, 21, %g1 ! shift left to <21> afsr UE + andcc %g1, %g3, %g0 ! check for UE in afsr + bz,a,pn %icc, 2f ! if !UE skip sdb read/clear + nop + + set P_DER_H, %g4 ! put P_DER_H in g4 + ldxa [%g4]ASI_SDB_INTR_R, %g5 ! read sdb upper half into 56 + or %g0, 1, %g6 ! put 1 in g6 + sllx %g6, 9, %g6 ! shift g6 to <9> sdb UE + andcc %g5, %g6, %g1 ! check for UE in upper half + sllx %g5, 33, %g5 ! shift upper bits to <42:33> + or %g3, %g5, %g3 ! or with afsr bits + bz,a 1f ! no error, goto 1f + nop + stxa %g1, [%g4]ASI_SDB_INTR_W ! clear sdb reg UE error bit + membar #Sync ! membar sync required +1: + set P_DER_L, %g4 ! put P_DER_L in g4 + ldxa [%g4]ASI_SDB_INTR_R, %g5 ! read sdb lower half into g5 + andcc %g5, %g6, %g1 ! check for UE in lower half + sllx %g5, 43, %g5 ! shift upper bits to <52:43> + or %g3, %g5, %g3 ! or with afsr bits + bz,a 2f ! no error, goto 2f + nop + stxa %g1, [%g4]ASI_SDB_INTR_W ! clear sdb reg UE error bit + membar #Sync ! 
membar sync required +2: + stxa %g3, [%g0]ASI_AFSR ! clear all the sticky bits + membar #Sync ! membar sync required + + set cpu_async_error, %g1 ! put cpu_async_error in g1 + sethi %hi(sys_trap), %g5 + jmp %g5 + %lo(sys_trap) ! goto sys_trap + or %g0, PIL_15, %g4 ! run at pil 15 + SET_SIZE(async_err) + + ENTRY_NP(dis_err_panic1) + stxa %g0, [%g0]ASI_ESTATE_ERR ! disable all error traps + membar #Sync + ! save destination routine is in g1 + ldxa [%g0]ASI_AFAR, %g2 ! read afar + ldxa [%g0]ASI_AFSR, %g3 ! read afsr + set P_DER_H, %g4 ! put P_DER_H in g4 + ldxa [%g4]ASI_SDB_INTR_R, %g5 ! read sdb upper half into g5 + sllx %g5, 33, %g5 ! shift upper bits to <42:33> + or %g3, %g5, %g3 ! or with afsr bits + set P_DER_L, %g4 ! put P_DER_L in g4 + ldxa [%g4]ASI_SDB_INTR_R, %g5 ! read sdb lower half into g5 + sllx %g5, 43, %g5 ! shift upper bits to <52:43> + or %g3, %g5, %g3 ! or with afsr bits + sethi %hi(sys_trap), %g5 + jmp %g5 + %lo(sys_trap) ! goto sys_trap + sub %g0, 1, %g4 + SET_SIZE(dis_err_panic1) + + ENTRY(clr_datapath) + set P_DER_H, %o4 ! put P_DER_H in o4 + ldxa [%o4]ASI_SDB_INTR_R, %o5 ! read sdb upper half into o3 + or %g0, 0x3, %o2 ! put 0x3 in o2 + sllx %o2, 8, %o2 ! shift o2 to <9:8> sdb + andcc %o5, %o2, %o1 ! check for UE,CE in upper half + bz,a 1f ! no error, goto 1f + nop + stxa %o1, [%o4]ASI_SDB_INTR_W ! clear sdb reg UE,CE error bits + membar #Sync ! membar sync required +1: + set P_DER_L, %o4 ! put P_DER_L in o4 + ldxa [%o4]ASI_SDB_INTR_R, %o5 ! read sdb lower half into o5 + andcc %o5, %o2, %o1 ! check for UE,CE in lower half + bz,a 2f ! no error, goto 2f + nop + stxa %o1, [%o4]ASI_SDB_INTR_W ! clear sdb reg UE,CE error bits + membar #Sync +2: + retl + nop + SET_SIZE(clr_datapath) + + ENTRY(get_udb_errors) + set P_DER_H, %o3 + ldxa [%o3]ASI_SDB_INTR_R, %o2 + stx %o2, [%o0] + set P_DER_L, %o3 + ldxa [%o3]ASI_SDB_INTR_R, %o2 + retl + stx %o2, [%o1] + SET_SIZE(get_udb_errors) + +#endif /* lint */ + +#if defined(lint) +/* + * The itlb_rd_entry and dtlb_rd_entry functions return the tag portion of the + * tte, the virtual address, and the ctxnum of the specified tlb entry. They + * should only be used in places where you have no choice but to look at the + * tlb itself. + * + * Note: These two routines are required by the Estar "cpr" loadable module. + */ +/*ARGSUSED*/ +void +itlb_rd_entry(uint_t entry, tte_t *tte, uint64_t *va_tag) +{} + +/*ARGSUSED*/ +void +dtlb_rd_entry(uint_t entry, tte_t *tte, uint64_t *va_tag) +{} +#else /* lint */ +/* + * NB - In Spitfire cpus, when reading a tte from the hardware, we + * need to clear [42-41] because the general definitions in pte.h + * define the PA to be [42-13] whereas Spitfire really uses [40-13]. + * When cloning these routines for other cpus the "andn" below is not + * necessary. + */ + ENTRY_NP(itlb_rd_entry) + sllx %o0, 3, %o0 +#if defined(SF_ERRATA_32) + sethi %hi(FLUSH_ADDR), %g2 + set MMU_PCONTEXT, %g1 + stxa %g0, [%g1]ASI_DMMU ! KCONTEXT + flush %g2 +#endif + ldxa [%o0]ASI_ITLB_ACCESS, %g1 + set TTE_SPITFIRE_PFNHI_CLEAR, %g2 ! spitfire only + sllx %g2, TTE_SPITFIRE_PFNHI_SHIFT, %g2 ! see comment above + andn %g1, %g2, %g1 ! for details + stx %g1, [%o1] + ldxa [%o0]ASI_ITLB_TAGREAD, %g2 + set TAGREAD_CTX_MASK, %o4 + andn %g2, %o4, %o5 + retl + stx %o5, [%o2] + SET_SIZE(itlb_rd_entry) + + ENTRY_NP(dtlb_rd_entry) + sllx %o0, 3, %o0 +#if defined(SF_ERRATA_32) + sethi %hi(FLUSH_ADDR), %g2 + set MMU_PCONTEXT, %g1 + stxa %g0, [%g1]ASI_DMMU ! 
KCONTEXT + flush %g2 +#endif + ldxa [%o0]ASI_DTLB_ACCESS, %g1 + set TTE_SPITFIRE_PFNHI_CLEAR, %g2 ! spitfire only + sllx %g2, TTE_SPITFIRE_PFNHI_SHIFT, %g2 ! see comment above + andn %g1, %g2, %g1 ! itlb_rd_entry + stx %g1, [%o1] + ldxa [%o0]ASI_DTLB_TAGREAD, %g2 + set TAGREAD_CTX_MASK, %o4 + andn %g2, %o4, %o5 + retl + stx %o5, [%o2] + SET_SIZE(dtlb_rd_entry) +#endif /* lint */ + +#if defined(lint) + +/* + * routines to get and set the LSU register + */ +uint64_t +get_lsu(void) +{ + return ((uint64_t)0); +} + +/*ARGSUSED*/ +void +set_lsu(uint64_t lsu) +{} + +#else /* lint */ + + ENTRY(set_lsu) + stxa %o0, [%g0]ASI_LSU ! store to LSU + retl + membar #Sync + SET_SIZE(set_lsu) + + ENTRY(get_lsu) + retl + ldxa [%g0]ASI_LSU, %o0 ! load LSU + SET_SIZE(get_lsu) + +#endif /* lint */ + +#ifndef lint + /* + * Clear the NPT (non-privileged trap) bit in the %tick + * registers. In an effort to make the change in the + * tick counter as consistent as possible, we disable + * all interrupts while we're changing the registers. We also + * ensure that the read and write instructions are in the same + * line in the instruction cache. + */ + ENTRY_NP(cpu_clearticknpt) + rdpr %pstate, %g1 /* save processor state */ + andn %g1, PSTATE_IE, %g3 /* turn off */ + wrpr %g0, %g3, %pstate /* interrupts */ + rdpr %tick, %g2 /* get tick register */ + brgez,pn %g2, 1f /* if NPT bit off, we're done */ + mov 1, %g3 /* create mask */ + sllx %g3, 63, %g3 /* for NPT bit */ + ba,a,pt %xcc, 2f + .align 64 /* Align to I$ boundary */ +2: + rdpr %tick, %g2 /* get tick register */ + wrpr %g3, %g2, %tick /* write tick register, */ + /* clearing NPT bit */ +#if defined(BB_ERRATA_1) + rdpr %tick, %g0 /* read (s)tick (BB_ERRATA_1) */ +#endif +1: + jmp %g4 + 4 + wrpr %g0, %g1, %pstate /* restore processor state */ + SET_SIZE(cpu_clearticknpt) + + /* + * get_ecache_tag() + * Register Usage: + * %o0: In: 32-bit E$ index + * Out: 64-bit E$ tag value + * %o1: In: 64-bit AFSR value after clearing sticky bits + * %o2: In: address of cpu private afsr storage + */ + ENTRY(get_ecache_tag) + or %g0, 1, %o4 + sllx %o4, 40, %o4 ! set bit 40 for e$ tag access + or %o0, %o4, %o4 ! %o4 = e$ addr for tag read + rdpr %pstate, %o5 + andn %o5, PSTATE_IE | PSTATE_AM, %o0 + wrpr %o0, %g0, %pstate ! clear IE, AM bits + + ldxa [%g0]ASI_ESTATE_ERR, %g1 + stxa %g0, [%g0]ASI_ESTATE_ERR ! Turn off Error enable + membar #Sync + + ldxa [%g0]ASI_AFSR, %o0 + srlx %o0, P_AFSR_CP_SHIFT, %o3 + btst 1, %o3 + bz 1f + nop + ldx [%o2], %g4 + or %g4, %o0, %g4 ! aggregate AFSR in cpu private + stx %g4, [%o2] +1: + stxa %o0, [%g0]ASI_AFSR ! clear AFSR + membar #Sync + + ldxa [%o4]ASI_EC_R, %g0 + ldxa [%g0]ASI_EC_DIAG, %o0 ! read tag from e$ tag reg + + ldxa [%g0]ASI_AFSR, %o3 + srlx %o3, P_AFSR_CP_SHIFT, %o4 + btst 1, %o4 + bz 2f + stx %o3, [%o1] ! AFSR after sticky clear + ldx [%o2], %g4 + or %g4, %o3, %g4 ! aggregate AFSR in cpu private + stx %g4, [%o2] +2: + membar #Sync + + stxa %g1, [%g0]ASI_ESTATE_ERR ! Turn error enable back on + membar #Sync + retl + wrpr %g0, %o5, %pstate + SET_SIZE(get_ecache_tag) + + /* + * check_ecache_line() + * Register Usage: + * %o0: In: 32-bit E$ index + * Out: 64-bit accumulated AFSR + * %o1: In: address of cpu private afsr storage + */ + ENTRY(check_ecache_line) + or %g0, 1, %o4 + sllx %o4, 39, %o4 ! set bit 39 for e$ data access + or %o0, %o4, %o4 ! %o4 = e$ addr for data read + + rdpr %pstate, %o5 + andn %o5, PSTATE_IE | PSTATE_AM, %o0 + wrpr %o0, %g0, %pstate ! 
clear IE, AM bits + + ldxa [%g0]ASI_ESTATE_ERR, %g1 + stxa %g0, [%g0]ASI_ESTATE_ERR ! Turn off Error enable + membar #Sync + + ldxa [%g0]ASI_AFSR, %o0 + srlx %o0, P_AFSR_CP_SHIFT, %o2 + btst 1, %o2 + bz 1f + clr %o2 ! loop count + ldx [%o1], %o3 + or %o3, %o0, %o3 ! aggregate AFSR in cpu private + stx %o3, [%o1] +1: + stxa %o0, [%g0]ASI_AFSR ! clear AFSR + membar #Sync + +2: + ldxa [%o4]ASI_EC_R, %g0 ! Read the E$ data 8bytes each + add %o2, 1, %o2 + cmp %o2, 8 + bl,a 2b + add %o4, 8, %o4 + + membar #Sync + ldxa [%g0]ASI_AFSR, %o0 ! read accumulated AFSR + srlx %o0, P_AFSR_CP_SHIFT, %o2 + btst 1, %o2 + bz 3f + nop + ldx [%o1], %o3 + or %o3, %o0, %o3 ! aggregate AFSR in cpu private + stx %o3, [%o1] +3: + stxa %o0, [%g0]ASI_AFSR ! clear AFSR + membar #Sync + stxa %g1, [%g0]ASI_ESTATE_ERR ! Turn error enable back on + membar #Sync + retl + wrpr %g0, %o5, %pstate + SET_SIZE(check_ecache_line) +#endif /* lint */ + +#if defined(lint) +uint64_t +read_and_clear_afsr() +{ + return ((uint64_t)0); +} +#else /* lint */ + ENTRY(read_and_clear_afsr) + ldxa [%g0]ASI_AFSR, %o0 + retl + stxa %o0, [%g0]ASI_AFSR ! clear AFSR + SET_SIZE(read_and_clear_afsr) +#endif /* lint */ + +#if defined(lint) +/* ARGSUSED */ +void +scrubphys(uint64_t paddr, int ecache_size) +{ +} + +#else /* lint */ + +/* + * scrubphys - Pass in the aligned physical memory address that you want + * to scrub, along with the ecache size. + * + * 1) Displacement flush the E$ line corresponding to %addr. + * The first ldxa guarantees that the %addr is no longer in + * M, O, or E (goes to I or S (if instruction fetch also happens). + * 2) "Write" the data using a CAS %addr,%g0,%g0. + * The casxa guarantees a transition from I to M or S to M. + * 3) Displacement flush the E$ line corresponding to %addr. + * The second ldxa pushes the M line out of the ecache, into the + * writeback buffers, on the way to memory. + * 4) The "membar #Sync" pushes the cache line out of the writeback + * buffers onto the bus, on the way to dram finally. + * + * This is a modified version of the algorithm suggested by Gary Lauterbach. + * In theory the CAS %addr,%g0,%g0 is supposed to mark the addr's cache line + * as modified, but then we found out that for spitfire, if it misses in the + * E$ it will probably install as an M, but if it hits in the E$, then it + * will stay E, if the store doesn't happen. So the first displacement flush + * should ensure that the CAS will miss in the E$. Arrgh. + */ + + ENTRY(scrubphys) + or %o1, %g0, %o2 ! put ecache size in %o2 +#ifndef HUMMINGBIRD + xor %o0, %o2, %o1 ! calculate alias address + add %o2, %o2, %o3 ! 2 * ecachesize in case + ! addr == ecache_flushaddr + sub %o3, 1, %o3 ! -1 == mask + and %o1, %o3, %o1 ! and with xor'd address + set ecache_flushaddr, %o3 + ldx [%o3], %o3 + + rdpr %pstate, %o4 + andn %o4, PSTATE_IE | PSTATE_AM, %o5 + wrpr %o5, %g0, %pstate ! clear IE, AM bits + + ldxa [%o1 + %o3]ASI_MEM, %g0 ! load ecache_flushaddr + alias + casxa [%o0]ASI_MEM, %g0, %g0 + ldxa [%o1 + %o3]ASI_MEM, %g0 ! load ecache_flushaddr + alias + +#else /* HUMMINGBIRD */ + /* + * UltraSPARC-IIe processor supports both 4-way set associative + * and direct map E$. We need to reconfigure E$ to direct map + * mode for data load/store before displacement flush. Also, we + * need to flush all 4 sets of the E$ to ensure that the physaddr + * has been flushed. Keep the interrupts disabled while flushing + * E$ in this manner. 
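+ *
+ * (Direct map mode is selected by setting HB_UPA_DMAP_DATA_BIT in the
+ * UPA configuration register; the previous UPA config value is saved
+ * and written back once the flush is complete.)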
+ * + * For flushing a specific physical address, we start at the + * aliased address and load at set-size stride, wrapping around + * at 2*ecache-size boundary and skipping fault physical address. + * It takes 10 loads to guarantee that the physical address has + * been flushed. + * + * Usage: + * %o0 physaddr + * %o5 physaddr - ecache_flushaddr + * %g1 UPA config (restored later) + * %g2 E$ set size + * %g3 E$ flush address range mask (i.e. 2 * E$ -1) + * %g4 #loads to flush phys address + * %g5 temp + */ + + sethi %hi(ecache_associativity), %g5 + ld [%g5 + %lo(ecache_associativity)], %g5 + udivx %o2, %g5, %g2 ! set size (i.e. ecache_size/#sets) + xor %o0, %o2, %o1 ! calculate alias address + add %o2, %o2, %g3 ! 2 * ecachesize in case + ! addr == ecache_flushaddr + sub %g3, 1, %g3 ! 2 * ecachesize -1 == mask + and %o1, %g3, %o1 ! and with xor'd address + sethi %hi(ecache_flushaddr), %o3 + ldx [%o3 + %lo(ecache_flushaddr)], %o3 + + rdpr %pstate, %o4 + andn %o4, PSTATE_IE | PSTATE_AM, %o5 + wrpr %o5, %g0, %pstate ! clear IE, AM bits + + ! Place E$ in direct map mode for data access + or %g0, 1, %g5 + sllx %g5, HB_UPA_DMAP_DATA_BIT, %g5 + ldxa [%g0]ASI_UPA_CONFIG, %g1 ! current UPA config (restored later) + or %g1, %g5, %g5 + membar #Sync + stxa %g5, [%g0]ASI_UPA_CONFIG ! enable direct map for data access + membar #Sync + + ! Displace cache line from each set of E$ starting at the + ! aliased address. at set-size stride, wrapping at 2*ecache_size + ! and skipping load from physaddr. We need 10 loads to flush the + ! physaddr from E$. + mov HB_PHYS_FLUSH_CNT-1, %g4 ! #loads to flush phys addr + sub %o0, %o3, %o5 ! physaddr - ecache_flushaddr + or %o1, %g0, %g5 ! starting aliased offset +2: + ldxa [%g5 + %o3]ASI_MEM, %g0 ! load ecache_flushaddr + alias +1: + add %g5, %g2, %g5 ! calculate offset in next set + and %g5, %g3, %g5 ! force offset within aliased range + cmp %g5, %o5 ! skip loads from physaddr + be,pn %ncc, 1b + nop + brgz,pt %g4, 2b + dec %g4 + + casxa [%o0]ASI_MEM, %g0, %g0 + + ! Flush %o0 from ecahe again. + ! Need single displacement flush at offset %o1 this time as + ! the E$ is already in direct map mode. + ldxa [%o1 + %o3]ASI_MEM, %g0 ! load ecache_flushaddr + alias + + membar #Sync + stxa %g1, [%g0]ASI_UPA_CONFIG ! restore UPA config (DM bits) + membar #Sync +#endif /* HUMMINGBIRD */ + wrpr %g0, %o4, %pstate ! restore earlier pstate register value + + retl + membar #Sync ! move the data out of the load buffer + SET_SIZE(scrubphys) + +#endif /* lint */ + +#if defined(lint) + +/* + * clearphys - Pass in the aligned physical memory address that you want + * to push out, as a 64 byte block of zeros, from the ecache zero-filled. + * Since this routine does not bypass the ecache, it is possible that + * it could generate a UE error while trying to clear the a bad line. + * This routine clears and restores the error enable flag. + * TBD - Hummingbird may need similar protection + */ +/* ARGSUSED */ +void +clearphys(uint64_t paddr, int ecache_size, int ecache_linesize) +{ +} + +#else /* lint */ + + ENTRY(clearphys) + or %o2, %g0, %o3 ! ecache linesize + or %o1, %g0, %o2 ! ecache size +#ifndef HUMMINGBIRD + or %o3, %g0, %o4 ! save ecache linesize + xor %o0, %o2, %o1 ! calculate alias address + add %o2, %o2, %o3 ! 2 * ecachesize + sub %o3, 1, %o3 ! -1 == mask + and %o1, %o3, %o1 ! and with xor'd address + set ecache_flushaddr, %o3 + ldx [%o3], %o3 + or %o4, %g0, %o2 ! saved ecache linesize + + rdpr %pstate, %o4 + andn %o4, PSTATE_IE | PSTATE_AM, %o5 + wrpr %o5, %g0, %pstate ! 
clear IE, AM bits + + ldxa [%g0]ASI_ESTATE_ERR, %g1 + stxa %g0, [%g0]ASI_ESTATE_ERR ! disable errors + membar #Sync + + ! need to put zeros in the cache line before displacing it + + sub %o2, 8, %o2 ! get offset of last double word in ecache line +1: + stxa %g0, [%o0 + %o2]ASI_MEM ! put zeros in the ecache line + sub %o2, 8, %o2 + brgez,a,pt %o2, 1b + nop + ldxa [%o1 + %o3]ASI_MEM, %g0 ! load ecache_flushaddr + alias + casxa [%o0]ASI_MEM, %g0, %g0 + ldxa [%o1 + %o3]ASI_MEM, %g0 ! load ecache_flushaddr + alias + + stxa %g1, [%g0]ASI_ESTATE_ERR ! restore error enable + membar #Sync + +#else /* HUMMINGBIRD... */ + /* + * UltraSPARC-IIe processor supports both 4-way set associative + * and direct map E$. We need to reconfigure E$ to direct map + * mode for data load/store before displacement flush. Also, we + * need to flush all 4 sets of the E$ to ensure that the physaddr + * has been flushed. Keep the interrupts disabled while flushing + * E$ in this manner. + * + * For flushing a specific physical address, we start at the + * aliased address and load at set-size stride, wrapping around + * at 2*ecache-size boundary and skipping fault physical address. + * It takes 10 loads to guarantee that the physical address has + * been flushed. + * + * Usage: + * %o0 physaddr + * %o5 physaddr - ecache_flushaddr + * %g1 UPA config (restored later) + * %g2 E$ set size + * %g3 E$ flush address range mask (i.e. 2 * E$ -1) + * %g4 #loads to flush phys address + * %g5 temp + */ + + or %o3, %g0, %o4 ! save ecache linesize + sethi %hi(ecache_associativity), %g5 + ld [%g5 + %lo(ecache_associativity)], %g5 + udivx %o2, %g5, %g2 ! set size (i.e. ecache_size/#sets) + + xor %o0, %o2, %o1 ! calculate alias address + add %o2, %o2, %g3 ! 2 * ecachesize + sub %g3, 1, %g3 ! 2 * ecachesize -1 == mask + and %o1, %g3, %o1 ! and with xor'd address + sethi %hi(ecache_flushaddr), %o3 + ldx [%o3 +%lo(ecache_flushaddr)], %o3 + or %o4, %g0, %o2 ! saved ecache linesize + + rdpr %pstate, %o4 + andn %o4, PSTATE_IE | PSTATE_AM, %o5 + wrpr %o5, %g0, %pstate ! clear IE, AM bits + + ! Place E$ in direct map mode for data access + or %g0, 1, %g5 + sllx %g5, HB_UPA_DMAP_DATA_BIT, %g5 + ldxa [%g0]ASI_UPA_CONFIG, %g1 ! current UPA config (restored later) + or %g1, %g5, %g5 + membar #Sync + stxa %g5, [%g0]ASI_UPA_CONFIG ! enable direct map for data access + membar #Sync + + ! need to put zeros in the cache line before displacing it + + sub %o2, 8, %o2 ! get offset of last double word in ecache line +1: + stxa %g0, [%o0 + %o2]ASI_MEM ! put zeros in the ecache line + sub %o2, 8, %o2 + brgez,a,pt %o2, 1b + nop + + ! Displace cache line from each set of E$ starting at the + ! aliased address. at set-size stride, wrapping at 2*ecache_size + ! and skipping load from physaddr. We need 10 loads to flush the + ! physaddr from E$. + mov HB_PHYS_FLUSH_CNT-1, %g4 ! #loads to flush phys addr + sub %o0, %o3, %o5 ! physaddr - ecache_flushaddr + or %o1, %g0, %g5 ! starting offset +2: + ldxa [%g5 + %o3]ASI_MEM, %g0 ! load ecache_flushaddr + alias +3: + add %g5, %g2, %g5 ! calculate offset in next set + and %g5, %g3, %g5 ! force offset within aliased range + cmp %g5, %o5 ! skip loads from physaddr + be,pn %ncc, 3b + nop + brgz,pt %g4, 2b + dec %g4 + + casxa [%o0]ASI_MEM, %g0, %g0 + + ! Flush %o0 from ecahe again. + ! Need single displacement flush at offset %o1 this time as + ! the E$ is already in direct map mode. + ldxa [%o1 + %o3]ASI_MEM, %g0 ! load ecache_flushaddr + alias + + membar #Sync + stxa %g1, [%g0]ASI_UPA_CONFIG ! 
restore UPA config (DM bits) + membar #Sync +#endif /* HUMMINGBIRD... */ + + retl + wrpr %g0, %o4, %pstate ! restore earlier pstate register value + SET_SIZE(clearphys) + +#endif /* lint */ + +#if defined(lint) +/* ARGSUSED */ +void +flushecacheline(uint64_t paddr, int ecache_size) +{ +} + +#else /* lint */ +/* + * flushecacheline - This is a simpler version of scrubphys + * which simply does a displacement flush of the line in + * question. This routine is mainly used in handling async + * errors where we want to get rid of a bad line in ecache. + * Note that if the line is modified and it has suffered + * data corruption - we are guarantee that the hw will write + * a UE back to mark the page poisoned. + */ + ENTRY(flushecacheline) + or %o1, %g0, %o2 ! put ecache size in %o2 +#ifndef HUMMINGBIRD + xor %o0, %o2, %o1 ! calculate alias address + add %o2, %o2, %o3 ! 2 * ecachesize in case + ! addr == ecache_flushaddr + sub %o3, 1, %o3 ! -1 == mask + and %o1, %o3, %o1 ! and with xor'd address + set ecache_flushaddr, %o3 + ldx [%o3], %o3 + + rdpr %pstate, %o4 + andn %o4, PSTATE_IE | PSTATE_AM, %o5 + wrpr %o5, %g0, %pstate ! clear IE, AM bits + + ldxa [%g0]ASI_ESTATE_ERR, %g1 + stxa %g0, [%g0]ASI_ESTATE_ERR ! disable errors + membar #Sync + + ldxa [%o1 + %o3]ASI_MEM, %g0 ! load ecache_flushaddr + alias + membar #Sync + stxa %g1, [%g0]ASI_ESTATE_ERR ! restore error enable + membar #Sync +#else /* HUMMINGBIRD */ + /* + * UltraSPARC-IIe processor supports both 4-way set associative + * and direct map E$. We need to reconfigure E$ to direct map + * mode for data load/store before displacement flush. Also, we + * need to flush all 4 sets of the E$ to ensure that the physaddr + * has been flushed. Keep the interrupts disabled while flushing + * E$ in this manner. + * + * For flushing a specific physical address, we start at the + * aliased address and load at set-size stride, wrapping around + * at 2*ecache-size boundary and skipping fault physical address. + * It takes 10 loads to guarantee that the physical address has + * been flushed. + * + * Usage: + * %o0 physaddr + * %o5 physaddr - ecache_flushaddr + * %g1 error enable register + * %g2 E$ set size + * %g3 E$ flush address range mask (i.e. 2 * E$ -1) + * %g4 UPA config (restored later) + * %g5 temp + */ + + sethi %hi(ecache_associativity), %g5 + ld [%g5 + %lo(ecache_associativity)], %g5 + udivx %o2, %g5, %g2 ! set size (i.e. ecache_size/#sets) + xor %o0, %o2, %o1 ! calculate alias address + add %o2, %o2, %g3 ! 2 * ecachesize in case + ! addr == ecache_flushaddr + sub %g3, 1, %g3 ! 2 * ecachesize -1 == mask + and %o1, %g3, %o1 ! and with xor'd address + sethi %hi(ecache_flushaddr), %o3 + ldx [%o3 + %lo(ecache_flushaddr)], %o3 + + rdpr %pstate, %o4 + andn %o4, PSTATE_IE | PSTATE_AM, %o5 + wrpr %o5, %g0, %pstate ! clear IE, AM bits + + ! Place E$ in direct map mode for data access + or %g0, 1, %g5 + sllx %g5, HB_UPA_DMAP_DATA_BIT, %g5 + ldxa [%g0]ASI_UPA_CONFIG, %g4 ! current UPA config (restored later) + or %g4, %g5, %g5 + membar #Sync + stxa %g5, [%g0]ASI_UPA_CONFIG ! enable direct map for data access + membar #Sync + + ldxa [%g0]ASI_ESTATE_ERR, %g1 + stxa %g0, [%g0]ASI_ESTATE_ERR ! disable errors + membar #Sync + + ! Displace cache line from each set of E$ starting at the + ! aliased address. at set-size stride, wrapping at 2*ecache_size + ! and skipping load from physaddr. We need 10 loads to flush the + ! physaddr from E$. + mov HB_PHYS_FLUSH_CNT-1, %g5 ! #loads to flush physaddr + sub %o0, %o3, %o5 ! 
physaddr - ecache_flushaddr +2: + ldxa [%o1 + %o3]ASI_MEM, %g0 ! load ecache_flushaddr + alias +3: + add %o1, %g2, %o1 ! calculate offset in next set + and %o1, %g3, %o1 ! force offset within aliased range + cmp %o1, %o5 ! skip loads from physaddr + be,pn %ncc, 3b + nop + brgz,pt %g5, 2b + dec %g5 + + membar #Sync + stxa %g1, [%g0]ASI_ESTATE_ERR ! restore error enable + membar #Sync + + stxa %g4, [%g0]ASI_UPA_CONFIG ! restore UPA config (DM bits) + membar #Sync +#endif /* HUMMINGBIRD */ + retl + wrpr %g0, %o4, %pstate + SET_SIZE(flushecacheline) + +#endif /* lint */ + +#if defined(lint) +/* ARGSUSED */ +void +ecache_scrubreq_tl1(uint64_t inum, uint64_t dummy) +{ +} + +#else /* lint */ +/* + * ecache_scrubreq_tl1 is the crosstrap handler called at ecache_calls_a_sec Hz + * from the clock CPU. It atomically increments the outstanding request + * counter and, if there was not already an outstanding request, + * branches to setsoftint_tl1 to enqueue an intr_req for the given inum. + */ + + ! Register usage: + ! + ! Arguments: + ! %g1 - inum + ! + ! Internal: + ! %g2, %g3, %g5 - scratch + ! %g4 - ptr. to spitfire_scrub_misc ec_scrub_outstanding. + ! %g6 - setsoftint_tl1 address + + ENTRY_NP(ecache_scrubreq_tl1) + set SFPR_SCRUB_MISC + EC_SCRUB_OUTSTANDING, %g2 + GET_CPU_PRIVATE_PTR(%g2, %g4, %g5, 1f); + ld [%g4], %g2 ! cpu's ec_scrub_outstanding. + set setsoftint_tl1, %g6 + ! + ! no need to use atomic instructions for the following + ! increment - we're at tl1 + ! + add %g2, 0x1, %g3 + brnz,pn %g2, 1f ! no need to enqueue more intr_req + st %g3, [%g4] ! delay - store incremented counter + jmp %g6 ! setsoftint_tl1(%g1) - queue intr_req + nop + ! not reached +1: + retry + SET_SIZE(ecache_scrubreq_tl1) + +#endif /* lint */ + +#if defined(lint) +/*ARGSUSED*/ +void +write_ec_tag_parity(uint32_t id) +{} +#else /* lint */ + + /* + * write_ec_tag_parity(), which zero's the ecache tag, + * marks the state as invalid and writes good parity to the tag. + * Input %o1= 32 bit E$ index + */ + ENTRY(write_ec_tag_parity) + or %g0, 1, %o4 + sllx %o4, 39, %o4 ! set bit 40 for e$ tag access + or %o0, %o4, %o4 ! %o4 = ecache addr for tag write + + rdpr %pstate, %o5 + andn %o5, PSTATE_IE | PSTATE_AM, %o1 + wrpr %o1, %g0, %pstate ! clear IE, AM bits + + ldxa [%g0]ASI_ESTATE_ERR, %g1 + stxa %g0, [%g0]ASI_ESTATE_ERR ! Turn off Error enable + membar #Sync + + ba 1f + nop + /* + * Align on the ecache boundary in order to force + * ciritical code section onto the same ecache line. + */ + .align 64 + +1: + set S_EC_PARITY, %o3 ! clear tag, state invalid + sllx %o3, S_ECPAR_SHIFT, %o3 ! and with good tag parity + stxa %o3, [%g0]ASI_EC_DIAG ! update with the above info + stxa %g0, [%o4]ASI_EC_W + membar #Sync + + stxa %g1, [%g0]ASI_ESTATE_ERR ! Turn error enable back on + membar #Sync + retl + wrpr %g0, %o5, %pstate + SET_SIZE(write_ec_tag_parity) + +#endif /* lint */ + +#if defined(lint) +/*ARGSUSED*/ +void +write_hb_ec_tag_parity(uint32_t id) +{} +#else /* lint */ + + /* + * write_hb_ec_tag_parity(), which zero's the ecache tag, + * marks the state as invalid and writes good parity to the tag. + * Input %o1= 32 bit E$ index + */ + ENTRY(write_hb_ec_tag_parity) + or %g0, 1, %o4 + sllx %o4, 39, %o4 ! set bit 40 for e$ tag access + or %o0, %o4, %o4 ! %o4 = ecache addr for tag write + + rdpr %pstate, %o5 + andn %o5, PSTATE_IE | PSTATE_AM, %o1 + wrpr %o1, %g0, %pstate ! clear IE, AM bits + + ldxa [%g0]ASI_ESTATE_ERR, %g1 + stxa %g0, [%g0]ASI_ESTATE_ERR ! 
Turn off Error enable + membar #Sync + + ba 1f + nop + /* + * Align on the ecache boundary in order to force + * ciritical code section onto the same ecache line. + */ + .align 64 +1: +#ifdef HUMMINGBIRD + set HB_EC_PARITY, %o3 ! clear tag, state invalid + sllx %o3, HB_ECPAR_SHIFT, %o3 ! and with good tag parity +#else /* !HUMMINGBIRD */ + set SB_EC_PARITY, %o3 ! clear tag, state invalid + sllx %o3, SB_ECPAR_SHIFT, %o3 ! and with good tag parity +#endif /* !HUMMINGBIRD */ + + stxa %o3, [%g0]ASI_EC_DIAG ! update with the above info + stxa %g0, [%o4]ASI_EC_W + membar #Sync + + stxa %g1, [%g0]ASI_ESTATE_ERR ! Turn error enable back on + membar #Sync + retl + wrpr %g0, %o5, %pstate + SET_SIZE(write_hb_ec_tag_parity) + +#endif /* lint */ + +#define VIS_BLOCKSIZE 64 + +#if defined(lint) + +/*ARGSUSED*/ +int +dtrace_blksuword32(uintptr_t addr, uint32_t *data, int tryagain) +{ return (0); } + +#else + + ENTRY(dtrace_blksuword32) + save %sp, -SA(MINFRAME + 4), %sp + + rdpr %pstate, %l1 + andn %l1, PSTATE_IE, %l2 ! disable interrupts to + wrpr %g0, %l2, %pstate ! protect our FPU diddling + + rd %fprs, %l0 + andcc %l0, FPRS_FEF, %g0 + bz,a,pt %xcc, 1f ! if the fpu is disabled + wr %g0, FPRS_FEF, %fprs ! ... enable the fpu + + st %f0, [%fp + STACK_BIAS - 4] ! save %f0 to the stack +1: + set 0f, %l5 + /* + * We're about to write a block full or either total garbage + * (not kernel data, don't worry) or user floating-point data + * (so it only _looks_ like garbage). + */ + ld [%i1], %f0 ! modify the block + membar #Sync + stn %l5, [THREAD_REG + T_LOFAULT] ! set up the lofault handler + stda %d0, [%i0]ASI_BLK_COMMIT_S ! store the modified block + membar #Sync + stn %g0, [THREAD_REG + T_LOFAULT] ! remove the lofault handler + + bz,a,pt %xcc, 1f + wr %g0, %l0, %fprs ! restore %fprs + + ld [%fp + STACK_BIAS - 4], %f0 ! restore %f0 +1: + + wrpr %g0, %l1, %pstate ! restore interrupts + + ret + restore %g0, %g0, %o0 + +0: + membar #Sync + stn %g0, [THREAD_REG + T_LOFAULT] ! remove the lofault handler + + bz,a,pt %xcc, 1f + wr %g0, %l0, %fprs ! restore %fprs + + ld [%fp + STACK_BIAS - 4], %f0 ! restore %f0 +1: + + wrpr %g0, %l1, %pstate ! restore interrupts + + /* + * If tryagain is set (%i2) we tail-call dtrace_blksuword32_err() + * which deals with watchpoints. Otherwise, just return -1. + */ + brnz,pt %i2, 1f + nop + ret + restore %g0, -1, %o0 +1: + call dtrace_blksuword32_err + restore + + SET_SIZE(dtrace_blksuword32) + +#endif /* lint */ diff --git a/usr/src/uts/sun4u/cpu/spitfire_copy.s b/usr/src/uts/sun4u/cpu/spitfire_copy.s new file mode 100644 index 0000000000..adf2f0f307 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/spitfire_copy.s @@ -0,0 +1,4939 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/asm_linkage.h> +#include <sys/vtrace.h> +#include <sys/machthread.h> +#include <sys/clock.h> +#include <sys/asi.h> +#include <sys/fsr.h> +#include <sys/privregs.h> + +#if !defined(lint) +#include "assym.h" +#endif /* lint */ + + +/* + * Pseudo-code to aid in understanding the control flow of the + * bcopy routine. + * + * On entry to bcopy: + * + * %l6 = curthread->t_lofault; + * used_block_copy = FALSE; ! %l6 |= 1 + * if (%l6 != NULL) { + * curthread->t_lofault = .copyerr; + * caller_error_handler = TRUE ! %l6 |= 2 + * } + * + * if (length < VIS_COPY) + * goto regular_copy; + * + * if (!use_vis) + * goto_regular_copy; + * + * if (curthread->t_lwp == NULL) { + * ! Kernel threads do not have pcb's in which to store + * ! the floating point state, disallow preemption during + * ! the copy. + * kpreempt_disable(curthread); + * } + * + * old_fprs = %fprs; + * old_gsr = %gsr; + * if (%fprs.fef) { + * ! If we need to save 4 blocks of fpregs then make sure + * ! the length is still appropriate for that extra overhead. + * if (length < (large_length + (64 * 4))) { + * if (curthread->t_lwp == NULL) + * kpreempt_enable(curthread); + * goto regular_copy; + * } + * %fprs.fef = 1; + * save current fpregs on stack using blockstore + * } else { + * %fprs.fef = 1; + * } + * + * used_block_copy = 1; ! %l6 |= 1 + * do_blockcopy_here; + * + * In lofault handler: + * curthread->t_lofault = .copyerr2; + * Continue on with the normal exit handler + * + * On exit: + * call_kpreempt = 0; + * if (used_block_copy) { ! %l6 & 1 + * %gsr = old_gsr; + * if (old_fprs & FPRS_FEF) + * restore fpregs from stack using blockload + * else + * zero fpregs + * %fprs = old_fprs; + * if (curthread->t_lwp == NULL) { + * kpreempt_enable(curthread); + * call_kpreempt = 1; + * } + * } + * curthread->t_lofault = (%l6 & ~3); + * if (call_kpreempt) + * kpreempt(%pil); + * return (0) + * + * In second lofault handler (.copyerr2): + * We've tried to restore fp state from the stack and failed. To + * prevent from returning with a corrupted fp state, we will panic. + */ + +/* + * Notes on preserving existing fp state: + * + * When a copyOP decides to use fp we may have to preserve existing + * floating point state. It is not the caller's state that we need to + * preserve - the rest of the kernel does not use fp and, anyway, fp + * registers are volatile across a call. Some examples: + * + * - userland has fp state and is interrupted (device interrupt + * or trap) and within the interrupt/trap handling we use + * bcopy() + * - another (higher level) interrupt or trap handler uses bcopy + * while a bcopy from an earlier interrupt is still active + * - an asynchronous error trap occurs while fp state exists (in + * userland or in kernel copy) and the tl0 component of the handling + * uses bcopy + * - a user process with fp state incurs a copy-on-write fault and + * hwblkpagecopy always uses fp + * + * We therefore need a per-call place in which to preserve fp state - + * using our stack is ideal (and since fp copy cannot be leaf optimized + * because of calls it makes, this is no hardship). 
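+ *
+ * (The flag bits described below can share %l6 with the saved t_lofault
+ * value because SPARC code addresses are at least 4-byte aligned,
+ * leaving the low two bits of the pointer free.)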
+ *
+ * To make sure that floating point state is always saved and restored
+ * correctly, the following "big rules" must be followed when the floating
+ * point registers will be used:
+ *
+ * 1. %l6 always holds the caller's lofault handler. Also in this register,
+ * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
+ * use. Bit 2 (BCOPY_FLAG) indicates that the call was to bcopy.
+ *
+ * 2. The FPUSED flag indicates that all FP state has been successfully stored
+ * on the stack. It should not be set until this save has been completed.
+ *
+ * 3. The FPUSED flag should not be cleared on exit until all FP state has
+ * been restored from the stack. If an error occurs while restoring
+ * data from the stack, the error handler can check this flag to see if
+ * a restore is necessary.
+ *
+ * 4. Code run under the new lofault handler must be kept to a minimum. In
+ * particular, any calls to kpreempt() should not be made until after the
+ * lofault handler has been restored.
+ */
+
+/*
+ * This shadows sys/machsystm.h which can't be included due to the lack of
+ * _ASM guards in include files it references. Change it here, change it there.
+ */
+#define VIS_COPY_THRESHOLD 900
+
+/*
+ * Less than or equal to this number of bytes we will always copy byte-for-byte
+ */
+#define SMALL_LIMIT 7
+
+/*
+ * Flags set in the lower bits of the t_lofault address:
+ * FPUSED_FLAG: The FP registers were in use and must be restored
+ * BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls
+ * COPY_FLAGS: Both of the above
+ *
+ * Other flags:
+ * KPREEMPT_FLAG: kpreempt needs to be called
+ */
+#define FPUSED_FLAG 1
+#define BCOPY_FLAG 2
+#define COPY_FLAGS (FPUSED_FLAG | BCOPY_FLAG)
+#define KPREEMPT_FLAG 4
+
+/*
+ * Size of stack frame in order to accommodate a 64-byte aligned
+ * floating-point register save area and 2 32-bit temp locations.
+ */
+#define HWCOPYFRAMESIZE ((64 * 5) + (2 * 4))
+
+#define SAVED_FPREGS_OFFSET (64 * 5)
+#define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 4)
+#define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 4)
+
+/*
+ * Common macros used by the various versions of the block copy
+ * routines in this file.
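+ *
+ * FZERO zeroes the 32 double registers %d0-%d62 by seeding %d0/%d2 with
+ * fzero and propagating zero through faddd/fmuld.  Each FALIGN_Dn macro
+ * emits eight faligndata instructions that assemble one 64-byte aligned
+ * output block in %d48-%d62 from the window of source doubles beginning
+ * at %dn; the window wraps from %d46 back to %d0.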
+ */ + +#define FZERO \ + fzero %f0 ;\ + fzero %f2 ;\ + faddd %f0, %f2, %f4 ;\ + fmuld %f0, %f2, %f6 ;\ + faddd %f0, %f2, %f8 ;\ + fmuld %f0, %f2, %f10 ;\ + faddd %f0, %f2, %f12 ;\ + fmuld %f0, %f2, %f14 ;\ + faddd %f0, %f2, %f16 ;\ + fmuld %f0, %f2, %f18 ;\ + faddd %f0, %f2, %f20 ;\ + fmuld %f0, %f2, %f22 ;\ + faddd %f0, %f2, %f24 ;\ + fmuld %f0, %f2, %f26 ;\ + faddd %f0, %f2, %f28 ;\ + fmuld %f0, %f2, %f30 ;\ + faddd %f0, %f2, %f32 ;\ + fmuld %f0, %f2, %f34 ;\ + faddd %f0, %f2, %f36 ;\ + fmuld %f0, %f2, %f38 ;\ + faddd %f0, %f2, %f40 ;\ + fmuld %f0, %f2, %f42 ;\ + faddd %f0, %f2, %f44 ;\ + fmuld %f0, %f2, %f46 ;\ + faddd %f0, %f2, %f48 ;\ + fmuld %f0, %f2, %f50 ;\ + faddd %f0, %f2, %f52 ;\ + fmuld %f0, %f2, %f54 ;\ + faddd %f0, %f2, %f56 ;\ + fmuld %f0, %f2, %f58 ;\ + faddd %f0, %f2, %f60 ;\ + fmuld %f0, %f2, %f62 + + +#define FALIGN_D0 \ + faligndata %d0, %d2, %d48 ;\ + faligndata %d2, %d4, %d50 ;\ + faligndata %d4, %d6, %d52 ;\ + faligndata %d6, %d8, %d54 ;\ + faligndata %d8, %d10, %d56 ;\ + faligndata %d10, %d12, %d58 ;\ + faligndata %d12, %d14, %d60 ;\ + faligndata %d14, %d16, %d62 + +#define FALIGN_D16 \ + faligndata %d16, %d18, %d48 ;\ + faligndata %d18, %d20, %d50 ;\ + faligndata %d20, %d22, %d52 ;\ + faligndata %d22, %d24, %d54 ;\ + faligndata %d24, %d26, %d56 ;\ + faligndata %d26, %d28, %d58 ;\ + faligndata %d28, %d30, %d60 ;\ + faligndata %d30, %d32, %d62 + +#define FALIGN_D32 \ + faligndata %d32, %d34, %d48 ;\ + faligndata %d34, %d36, %d50 ;\ + faligndata %d36, %d38, %d52 ;\ + faligndata %d38, %d40, %d54 ;\ + faligndata %d40, %d42, %d56 ;\ + faligndata %d42, %d44, %d58 ;\ + faligndata %d44, %d46, %d60 ;\ + faligndata %d46, %d0, %d62 + +#define FALIGN_D2 \ + faligndata %d2, %d4, %d48 ;\ + faligndata %d4, %d6, %d50 ;\ + faligndata %d6, %d8, %d52 ;\ + faligndata %d8, %d10, %d54 ;\ + faligndata %d10, %d12, %d56 ;\ + faligndata %d12, %d14, %d58 ;\ + faligndata %d14, %d16, %d60 ;\ + faligndata %d16, %d18, %d62 + +#define FALIGN_D18 \ + faligndata %d18, %d20, %d48 ;\ + faligndata %d20, %d22, %d50 ;\ + faligndata %d22, %d24, %d52 ;\ + faligndata %d24, %d26, %d54 ;\ + faligndata %d26, %d28, %d56 ;\ + faligndata %d28, %d30, %d58 ;\ + faligndata %d30, %d32, %d60 ;\ + faligndata %d32, %d34, %d62 + +#define FALIGN_D34 \ + faligndata %d34, %d36, %d48 ;\ + faligndata %d36, %d38, %d50 ;\ + faligndata %d38, %d40, %d52 ;\ + faligndata %d40, %d42, %d54 ;\ + faligndata %d42, %d44, %d56 ;\ + faligndata %d44, %d46, %d58 ;\ + faligndata %d46, %d0, %d60 ;\ + faligndata %d0, %d2, %d62 + +#define FALIGN_D4 \ + faligndata %d4, %d6, %d48 ;\ + faligndata %d6, %d8, %d50 ;\ + faligndata %d8, %d10, %d52 ;\ + faligndata %d10, %d12, %d54 ;\ + faligndata %d12, %d14, %d56 ;\ + faligndata %d14, %d16, %d58 ;\ + faligndata %d16, %d18, %d60 ;\ + faligndata %d18, %d20, %d62 + +#define FALIGN_D20 \ + faligndata %d20, %d22, %d48 ;\ + faligndata %d22, %d24, %d50 ;\ + faligndata %d24, %d26, %d52 ;\ + faligndata %d26, %d28, %d54 ;\ + faligndata %d28, %d30, %d56 ;\ + faligndata %d30, %d32, %d58 ;\ + faligndata %d32, %d34, %d60 ;\ + faligndata %d34, %d36, %d62 + +#define FALIGN_D36 \ + faligndata %d36, %d38, %d48 ;\ + faligndata %d38, %d40, %d50 ;\ + faligndata %d40, %d42, %d52 ;\ + faligndata %d42, %d44, %d54 ;\ + faligndata %d44, %d46, %d56 ;\ + faligndata %d46, %d0, %d58 ;\ + faligndata %d0, %d2, %d60 ;\ + faligndata %d2, %d4, %d62 + +#define FALIGN_D6 \ + faligndata %d6, %d8, %d48 ;\ + faligndata %d8, %d10, %d50 ;\ + faligndata %d10, %d12, %d52 ;\ + faligndata %d12, %d14, %d54 ;\ + faligndata %d14, %d16, %d56 ;\ + 
faligndata %d16, %d18, %d58 ;\ + faligndata %d18, %d20, %d60 ;\ + faligndata %d20, %d22, %d62 + +#define FALIGN_D22 \ + faligndata %d22, %d24, %d48 ;\ + faligndata %d24, %d26, %d50 ;\ + faligndata %d26, %d28, %d52 ;\ + faligndata %d28, %d30, %d54 ;\ + faligndata %d30, %d32, %d56 ;\ + faligndata %d32, %d34, %d58 ;\ + faligndata %d34, %d36, %d60 ;\ + faligndata %d36, %d38, %d62 + +#define FALIGN_D38 \ + faligndata %d38, %d40, %d48 ;\ + faligndata %d40, %d42, %d50 ;\ + faligndata %d42, %d44, %d52 ;\ + faligndata %d44, %d46, %d54 ;\ + faligndata %d46, %d0, %d56 ;\ + faligndata %d0, %d2, %d58 ;\ + faligndata %d2, %d4, %d60 ;\ + faligndata %d4, %d6, %d62 + +#define FALIGN_D8 \ + faligndata %d8, %d10, %d48 ;\ + faligndata %d10, %d12, %d50 ;\ + faligndata %d12, %d14, %d52 ;\ + faligndata %d14, %d16, %d54 ;\ + faligndata %d16, %d18, %d56 ;\ + faligndata %d18, %d20, %d58 ;\ + faligndata %d20, %d22, %d60 ;\ + faligndata %d22, %d24, %d62 + +#define FALIGN_D24 \ + faligndata %d24, %d26, %d48 ;\ + faligndata %d26, %d28, %d50 ;\ + faligndata %d28, %d30, %d52 ;\ + faligndata %d30, %d32, %d54 ;\ + faligndata %d32, %d34, %d56 ;\ + faligndata %d34, %d36, %d58 ;\ + faligndata %d36, %d38, %d60 ;\ + faligndata %d38, %d40, %d62 + +#define FALIGN_D40 \ + faligndata %d40, %d42, %d48 ;\ + faligndata %d42, %d44, %d50 ;\ + faligndata %d44, %d46, %d52 ;\ + faligndata %d46, %d0, %d54 ;\ + faligndata %d0, %d2, %d56 ;\ + faligndata %d2, %d4, %d58 ;\ + faligndata %d4, %d6, %d60 ;\ + faligndata %d6, %d8, %d62 + +#define FALIGN_D10 \ + faligndata %d10, %d12, %d48 ;\ + faligndata %d12, %d14, %d50 ;\ + faligndata %d14, %d16, %d52 ;\ + faligndata %d16, %d18, %d54 ;\ + faligndata %d18, %d20, %d56 ;\ + faligndata %d20, %d22, %d58 ;\ + faligndata %d22, %d24, %d60 ;\ + faligndata %d24, %d26, %d62 + +#define FALIGN_D26 \ + faligndata %d26, %d28, %d48 ;\ + faligndata %d28, %d30, %d50 ;\ + faligndata %d30, %d32, %d52 ;\ + faligndata %d32, %d34, %d54 ;\ + faligndata %d34, %d36, %d56 ;\ + faligndata %d36, %d38, %d58 ;\ + faligndata %d38, %d40, %d60 ;\ + faligndata %d40, %d42, %d62 + +#define FALIGN_D42 \ + faligndata %d42, %d44, %d48 ;\ + faligndata %d44, %d46, %d50 ;\ + faligndata %d46, %d0, %d52 ;\ + faligndata %d0, %d2, %d54 ;\ + faligndata %d2, %d4, %d56 ;\ + faligndata %d4, %d6, %d58 ;\ + faligndata %d6, %d8, %d60 ;\ + faligndata %d8, %d10, %d62 + +#define FALIGN_D12 \ + faligndata %d12, %d14, %d48 ;\ + faligndata %d14, %d16, %d50 ;\ + faligndata %d16, %d18, %d52 ;\ + faligndata %d18, %d20, %d54 ;\ + faligndata %d20, %d22, %d56 ;\ + faligndata %d22, %d24, %d58 ;\ + faligndata %d24, %d26, %d60 ;\ + faligndata %d26, %d28, %d62 + +#define FALIGN_D28 \ + faligndata %d28, %d30, %d48 ;\ + faligndata %d30, %d32, %d50 ;\ + faligndata %d32, %d34, %d52 ;\ + faligndata %d34, %d36, %d54 ;\ + faligndata %d36, %d38, %d56 ;\ + faligndata %d38, %d40, %d58 ;\ + faligndata %d40, %d42, %d60 ;\ + faligndata %d42, %d44, %d62 + +#define FALIGN_D44 \ + faligndata %d44, %d46, %d48 ;\ + faligndata %d46, %d0, %d50 ;\ + faligndata %d0, %d2, %d52 ;\ + faligndata %d2, %d4, %d54 ;\ + faligndata %d4, %d6, %d56 ;\ + faligndata %d6, %d8, %d58 ;\ + faligndata %d8, %d10, %d60 ;\ + faligndata %d10, %d12, %d62 + +#define FALIGN_D14 \ + faligndata %d14, %d16, %d48 ;\ + faligndata %d16, %d18, %d50 ;\ + faligndata %d18, %d20, %d52 ;\ + faligndata %d20, %d22, %d54 ;\ + faligndata %d22, %d24, %d56 ;\ + faligndata %d24, %d26, %d58 ;\ + faligndata %d26, %d28, %d60 ;\ + faligndata %d28, %d30, %d62 + +#define FALIGN_D30 \ + faligndata %d30, %d32, %d48 ;\ + faligndata %d32, 
%d34, %d50 ;\ + faligndata %d34, %d36, %d52 ;\ + faligndata %d36, %d38, %d54 ;\ + faligndata %d38, %d40, %d56 ;\ + faligndata %d40, %d42, %d58 ;\ + faligndata %d42, %d44, %d60 ;\ + faligndata %d44, %d46, %d62 + +#define FALIGN_D46 \ + faligndata %d46, %d0, %d48 ;\ + faligndata %d0, %d2, %d50 ;\ + faligndata %d2, %d4, %d52 ;\ + faligndata %d4, %d6, %d54 ;\ + faligndata %d6, %d8, %d56 ;\ + faligndata %d8, %d10, %d58 ;\ + faligndata %d10, %d12, %d60 ;\ + faligndata %d12, %d14, %d62 + + +/* + * Copy a block of storage, returning an error code if `from' or + * `to' takes a kernel pagefault which cannot be resolved. + * Returns errno value on pagefault error, 0 if all ok + */ + + + +#if defined(lint) + +/* ARGSUSED */ +int +kcopy(const void *from, void *to, size_t count) +{ return(0); } + +#else /* lint */ + + .seg ".text" + .align 4 + + ENTRY(kcopy) + + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + set .copyerr, %l6 ! copyerr is lofault value + ldn [THREAD_REG + T_LOFAULT], %l7 ! save existing handler + membar #Sync ! sync error barrier (see copy.s) + stn %l6, [THREAD_REG + T_LOFAULT] ! set t_lofault + ! + ! Note that we carefully do *not* flag the setting of + ! t_lofault. + ! + ba,pt %ncc, .do_copy ! common code + mov %l7, %l6 + +/* + * We got here because of a fault during kcopy or bcopy if a fault + * handler existed when bcopy was called. + * Errno value is in %g1. + */ +.copyerr: + set .copyerr2, %l1 + membar #Sync ! sync error barrier + stn %l1, [THREAD_REG + T_LOFAULT] ! set t_lofault + btst FPUSED_FLAG, %l6 + bz %icc, 1f + and %l6, BCOPY_FLAG, %l1 ! copy flag to %l1 + + membar #Sync + + ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr + wr %o2, 0, %gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz %icc, 4f + nop + + ! restore fpregs from stack + membar #Sync + add %fp, STACK_BIAS - 257, %o2 + and %o2, -64, %o2 + ldda [%o2]ASI_BLK_P, %d0 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d16 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d32 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d48 + membar #Sync + + ba,pt %ncc, 2f + wr %o3, 0, %fprs ! restore fprs + +4: + FZERO ! zero all of the fpregs + wr %o3, 0, %fprs ! restore fprs + +2: ldn [THREAD_REG + T_LWP], %o2 + tst %o2 + bnz,pt %ncc, 1f + nop + + ldsb [THREAD_REG + T_PREEMPT], %l0 + deccc %l0 + bnz,pn %ncc, 1f + stb %l0, [THREAD_REG + T_PREEMPT] + + ! Check for a kernel preemption request + ldn [THREAD_REG + T_CPU], %l0 + ldub [%l0 + CPU_KPRUNRUN], %l0 + tst %l0 + bnz,a,pt %ncc, 1f ! Need to call kpreempt? + or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag + + ! + ! Need to cater for the different expectations of kcopy + ! and bcopy. kcopy will *always* set a t_lofault handler + ! If it fires, we're expected to just return the error code + ! and *not* to invoke any existing error handler. As far as + ! bcopy is concerned, we only set t_lofault if there was an + ! existing lofault handler. In that case we're expected to + ! invoke the previously existing handler after restting the + ! t_lofault value. + ! +1: + andn %l6, COPY_FLAGS, %l6 ! remove flags from lofault address + membar #Sync ! sync error barrier + stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + + ! call kpreempt if necessary + btst KPREEMPT_FLAG, %l1 + bz,pt %icc, 2f + nop + call kpreempt + rdpr %pil, %o0 ! pass %pil +2: + btst BCOPY_FLAG, %l1 + bnz,pn %ncc, 3f + nop + ret + restore %g1, 0, %o0 + +3: + ! + ! We're here via bcopy. There *must* have been an error handler + ! 
in place otheerwise we would have died a nasty death already. + ! + jmp %l6 ! goto real handler + restore %g0, 0, %o0 ! dispose of copy window + +/* + * We got here because of a fault in .copyerr. We can't safely restore fp + * state, so we panic. + */ +fp_panic_msg: + .asciz "Unable to restore fp state after copy operation" + + .align 4 +.copyerr2: + set fp_panic_msg, %o0 + call panic + nop + SET_SIZE(kcopy) +#endif /* lint */ + + +/* + * Copy a block of storage - must not overlap (from + len <= to). + * Registers: l6 - saved t_lofault + * + * Copy a page of memory. + * Assumes double word alignment and a count >= 256. + */ +#if defined(lint) + +/* ARGSUSED */ +void +bcopy(const void *from, void *to, size_t count) +{} + +#else /* lint */ + + ENTRY(bcopy) + + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault + tst %l6 + ! + ! We've already captured whether t_lofault was zero on entry. + ! We need to mark ourselves as being from bcopy since both + ! kcopy and bcopy use the same code path. If BCOPY_FLAG is set + ! and the saved lofault was zero, we won't reset lofault on + ! returning. + ! + or %l6, BCOPY_FLAG, %l6 + bz,pt %ncc, .do_copy + sethi %hi(.copyerr), %o2 + or %o2, %lo(.copyerr), %o2 + membar #Sync ! sync error barrier + stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector + +.do_copy: + cmp %i2, 12 ! for small counts + blu %ncc, .bytecp ! just copy bytes + .empty + + cmp %i2, VIS_COPY_THRESHOLD ! for large counts + blu,pt %ncc, .bcb_punt + .empty + + ! + ! Check to see if VIS acceleration is enabled + ! + sethi %hi(use_hw_bcopy), %o2 + ld [%o2 + %lo(use_hw_bcopy)], %o2 + tst %o2 + bz,pn %icc, .bcb_punt + nop + + subcc %i1, %i0, %i3 + bneg,a,pn %ncc, 1f + neg %i3 +1: + /* + * Compare against 256 since we should be checking block addresses + * and (dest & ~63) - (src & ~63) can be 3 blocks even if + * src = dest + (64 * 3) + 63. + */ + cmp %i3, 256 + blu,pn %ncc, .bcb_punt + nop + + ldn [THREAD_REG + T_LWP], %o3 + tst %o3 + bnz,pt %ncc, 1f + nop + + ! kpreempt_disable(); + ldsb [THREAD_REG + T_PREEMPT], %o2 + inc %o2 + stb %o2, [THREAD_REG + T_PREEMPT] + +1: + rd %fprs, %o2 ! check for unused fp + st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs + btst FPRS_FEF, %o2 + bz,a %icc, .do_blockcopy + wr %g0, FPRS_FEF, %fprs + +.bcb_fpregs_inuse: + cmp %i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger + bgeu %ncc, 1f ! if we have to save the fpregs) + nop + + tst %o3 + bnz,pt %ncc, .bcb_punt + nop + + ldsb [THREAD_REG + T_PREEMPT], %l0 + deccc %l0 + bnz,pn %icc, .bcb_punt + stb %l0, [THREAD_REG + T_PREEMPT] + + ! Check for a kernel preemption request + ldn [THREAD_REG + T_CPU], %l0 + ldub [%l0 + CPU_KPRUNRUN], %l0 + tst %l0 + bz,pt %icc, .bcb_punt + nop + + ! Attempt to preempt + call kpreempt + rdpr %pil, %o0 ! pass %pil + + ba,pt %ncc, .bcb_punt + nop + +1: + wr %g0, FPRS_FEF, %fprs + + ! save in-use fpregs on stack + membar #Sync + add %fp, STACK_BIAS - 257, %o2 + and %o2, -64, %o2 + stda %d0, [%o2]ASI_BLK_P + add %o2, 64, %o2 + stda %d16, [%o2]ASI_BLK_P + add %o2, 64, %o2 + stda %d32, [%o2]ASI_BLK_P + add %o2, 64, %o2 + stda %d48, [%o2]ASI_BLK_P + membar #Sync + +.do_blockcopy: + membar #StoreStore|#StoreLoad|#LoadStore + + rd %gsr, %o2 + st %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr + + ! Set the lower bit in the saved t_lofault to indicate + ! that we need to clear the %fprs register on the way + ! out + or %l6, FPUSED_FLAG, %l6 + + ! Swap src/dst since the code below is memcpy code + ! 
and memcpy/bcopy have different calling sequences + mov %i1, %i5 + mov %i0, %i1 + mov %i5, %i0 + +!!! This code is nearly identical to the version in the sun4u +!!! libc_psr. Most bugfixes made to that file should be +!!! merged into this routine. + + andcc %i0, 7, %o3 + bz,pt %ncc, blkcpy + sub %o3, 8, %o3 + neg %o3 + sub %i2, %o3, %i2 + + ! Align Destination on double-word boundary + +2: ldub [%i1], %o4 + inc %i1 + inc %i0 + deccc %o3 + bgu %ncc, 2b + stb %o4, [%i0 - 1] +blkcpy: + andcc %i0, 63, %i3 + bz,pn %ncc, blalign ! now block aligned + sub %i3, 64, %i3 + neg %i3 ! bytes till block aligned + sub %i2, %i3, %i2 ! update %i2 with new count + + ! Copy %i3 bytes till dst is block (64 byte) aligned. use + ! double word copies. + + alignaddr %i1, %g0, %g1 + ldd [%g1], %d0 + add %g1, 8, %g1 +6: + ldd [%g1], %d2 + add %g1, 8, %g1 + subcc %i3, 8, %i3 + faligndata %d0, %d2, %d8 + std %d8, [%i0] + add %i1, 8, %i1 + bz,pn %ncc, blalign + add %i0, 8, %i0 + ldd [%g1], %d0 + add %g1, 8, %g1 + subcc %i3, 8, %i3 + faligndata %d2, %d0, %d8 + std %d8, [%i0] + add %i1, 8, %i1 + bgu,pn %ncc, 6b + add %i0, 8, %i0 + +blalign: + membar #StoreLoad + ! %i2 = total length + ! %i3 = blocks (length - 64) / 64 + ! %i4 = doubles remaining (length - blocks) + sub %i2, 64, %i3 + andn %i3, 63, %i3 + sub %i2, %i3, %i4 + andn %i4, 7, %i4 + sub %i4, 16, %i4 + sub %i2, %i4, %i2 + sub %i2, %i3, %i2 + + andn %i1, 0x3f, %l7 ! blk aligned address + alignaddr %i1, %g0, %g0 ! gen %gsr + + srl %i1, 3, %l5 ! bits 3,4,5 are now least sig in %l5 + andcc %l5, 7, %i5 ! mask everything except bits 1,2 3 + add %i1, %i4, %i1 + add %i1, %i3, %i1 + + ldda [%l7]ASI_BLK_P, %d0 + add %l7, 64, %l7 + ldda [%l7]ASI_BLK_P, %d16 + add %l7, 64, %l7 + ldda [%l7]ASI_BLK_P, %d32 + add %l7, 64, %l7 + sub %i3, 128, %i3 + + ! switch statement to get us to the right 8 byte blk within a + ! 64 byte block + cmp %i5, 4 + bgeu,a hlf + cmp %i5, 6 + cmp %i5, 2 + bgeu,a sqtr + nop + cmp %i5, 1 + be,a seg1 + nop + ba,pt %ncc, seg0 + nop +sqtr: + be,a seg2 + nop + ba,pt %ncc, seg3 + nop + +hlf: + bgeu,a fqtr + nop + cmp %i5, 5 + be,a seg5 + nop + ba,pt %ncc, seg4 + nop +fqtr: + be,a seg6 + nop + ba,pt %ncc, seg7 + nop + + +seg0: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D0 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D16 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D32 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, seg0 + +0: + FALIGN_D16 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D32 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd0 + add %i0, 64, %i0 + +1: + FALIGN_D32 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D0 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd16 + add %i0, 64, %i0 + +2: + FALIGN_D0 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D16 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd32 + add %i0, 64, %i0 + +seg1: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D2 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 
2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D18 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D34 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, seg1 +0: + FALIGN_D18 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D34 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd2 + add %i0, 64, %i0 + +1: + FALIGN_D34 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D2 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd18 + add %i0, 64, %i0 + +2: + FALIGN_D2 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D18 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd34 + add %i0, 64, %i0 + +seg2: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D4 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D20 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D36 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, seg2 + +0: + FALIGN_D20 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D36 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd4 + add %i0, 64, %i0 + +1: + FALIGN_D36 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D4 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd20 + add %i0, 64, %i0 + +2: + FALIGN_D4 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D20 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd36 + add %i0, 64, %i0 + +seg3: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D6 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D22 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D38 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, seg3 + +0: + FALIGN_D22 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D38 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd6 + add %i0, 64, %i0 + +1: + FALIGN_D38 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D6 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd22 + add %i0, 64, %i0 + +2: + FALIGN_D6 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D22 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd38 + add %i0, 64, %i0 + +seg4: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D8 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D24 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 
3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D40 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, seg4 + +0: + FALIGN_D24 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D40 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd8 + add %i0, 64, %i0 + +1: + FALIGN_D40 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D8 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd24 + add %i0, 64, %i0 + +2: + FALIGN_D8 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D24 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd40 + add %i0, 64, %i0 + +seg5: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D10 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D26 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D42 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, seg5 + +0: + FALIGN_D26 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D42 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd10 + add %i0, 64, %i0 + +1: + FALIGN_D42 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D10 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd26 + add %i0, 64, %i0 + +2: + FALIGN_D10 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D26 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd42 + add %i0, 64, %i0 + +seg6: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D12 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D28 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D44 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, seg6 + +0: + FALIGN_D28 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D44 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd12 + add %i0, 64, %i0 + +1: + FALIGN_D44 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D12 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd28 + add %i0, 64, %i0 + +2: + FALIGN_D12 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D28 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd44 + add %i0, 64, %i0 + +seg7: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D14 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D30 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 
3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D46 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, seg7 + +0: + FALIGN_D30 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D46 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd14 + add %i0, 64, %i0 + +1: + FALIGN_D46 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D14 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd30 + add %i0, 64, %i0 + +2: + FALIGN_D14 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D30 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, blkd46 + add %i0, 64, %i0 + + + ! + ! dribble out the last partial block + ! +blkd0: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d0, %d2, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd2: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d2, %d4, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd4: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d4, %d6, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd6: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d6, %d8, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd8: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d8, %d10, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd10: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d10, %d12, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd12: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d12, %d14, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd14: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + fsrc1 %d14, %d0 + ba,a,pt %ncc, blkleft + +blkd16: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d16, %d18, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd18: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d18, %d20, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd20: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d20, %d22, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd22: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d22, %d24, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd24: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d24, %d26, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd26: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d26, %d28, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd28: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d28, %d30, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd30: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + fsrc1 %d30, %d0 + ba,a,pt %ncc, blkleft +blkd32: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d32, %d34, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd34: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d34, %d36, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd36: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d36, %d38, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd38: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d38, %d40, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd40: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d40, %d42, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd42: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d42, %d44, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd44: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + faligndata %d44, %d46, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +blkd46: + subcc %i4, 8, %i4 + blu,pn %ncc, blkdone + fsrc1 %d46, %d0 + +blkleft: +1: + ldd [%l7], %d2 + add %l7, 8, %l7 + subcc %i4, 8, %i4 + 
faligndata %d0, %d2, %d8 + std %d8, [%i0] + blu,pn %ncc, blkdone + add %i0, 8, %i0 + ldd [%l7], %d0 + add %l7, 8, %l7 + subcc %i4, 8, %i4 + faligndata %d2, %d0, %d8 + std %d8, [%i0] + bgeu,pt %ncc, 1b + add %i0, 8, %i0 + +blkdone: + tst %i2 + bz,pt %ncc, .bcb_exit + and %l3, 0x4, %l3 ! fprs.du = fprs.dl = 0 + +7: ldub [%i1], %i4 + inc %i1 + inc %i0 + deccc %i2 + bgu,pt %ncc, 7b + stb %i4, [%i0 - 1] + +.bcb_exit: + membar #StoreLoad|#StoreStore + btst FPUSED_FLAG, %l6 + bz %icc, 1f + and %l6, COPY_FLAGS, %l1 ! Store flags in %l1 + ! We can't clear the flags from %l6 yet. + ! If there's an error, .copyerr will + ! need them + + ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr + wr %o2, 0, %gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz %icc, 4f + nop + + ! restore fpregs from stack + membar #Sync + add %fp, STACK_BIAS - 257, %o2 + and %o2, -64, %o2 + ldda [%o2]ASI_BLK_P, %d0 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d16 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d32 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d48 + membar #Sync + + ba,pt %ncc, 2f + wr %o3, 0, %fprs ! restore fprs + +4: + FZERO ! zero all of the fpregs + wr %o3, 0, %fprs ! restore fprs + +2: ldn [THREAD_REG + T_LWP], %o2 + tst %o2 + bnz,pt %ncc, 1f + nop + + ldsb [THREAD_REG + T_PREEMPT], %l0 + deccc %l0 + bnz,pn %ncc, 1f + stb %l0, [THREAD_REG + T_PREEMPT] + + ! Check for a kernel preemption request + ldn [THREAD_REG + T_CPU], %l0 + ldub [%l0 + CPU_KPRUNRUN], %l0 + tst %l0 + bnz,a,pt %ncc, 1f ! Need to call kpreempt? + or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag + +1: + btst BCOPY_FLAG, %l1 + bz,pn %icc, 3f + andncc %l6, COPY_FLAGS, %l6 + + ! + ! Here via bcopy. Check to see if the handler was NULL. + ! If so, just return quietly. Otherwise, reset the + ! handler and go home. + ! + bnz,pn %ncc, 3f + nop + + ! + ! Null handler. Check for kpreempt flag, call if necessary, + ! then return. + ! + btst KPREEMPT_FLAG, %l1 + bz,pt %icc, 2f + nop + call kpreempt + rdpr %pil, %o0 ! pass %pil +2: + ret + restore %g0, 0, %o0 + + ! + ! Here via kcopy or bcopy with a handler.Reset the + ! fault handler. + ! +3: + membar #Sync + stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + + ! call kpreempt if necessary + btst KPREEMPT_FLAG, %l1 + bz,pt %icc, 4f + nop + call kpreempt + rdpr %pil, %o0 +4: + ret + restore %g0, 0, %o0 + +.bcb_punt: + ! + ! use aligned transfers where possible + ! + xor %i0, %i1, %o4 ! xor from and to address + btst 7, %o4 ! if lower three bits zero + bz %icc, .aldoubcp ! can align on double boundary + .empty ! assembler complaints about label + + xor %i0, %i1, %o4 ! xor from and to address + btst 3, %o4 ! if lower two bits zero + bz %icc, .alwordcp ! can align on word boundary + btst 3, %i0 ! delay slot, from address unaligned? + ! + ! use aligned reads and writes where possible + ! this differs from wordcp in that it copes + ! with odd alignment between source and destnation + ! using word reads and writes with the proper shifts + ! in between to align transfers to and from memory + ! i0 - src address, i1 - dest address, i2 - count + ! i3, i4 - tmps for used generating complete word + ! i5 (word to write) + ! l0 size in bits of upper part of source word (US) + ! l1 size in bits of lower part of source word (LS = 32 - US) + ! l2 size in bits of upper part of destination word (UD) + ! l3 size in bits of lower part of destination word (LD = 32 - UD) + ! l4 number of bytes leftover after aligned transfers complete + ! l5 the number 32 + ! + mov 32, %l5 ! 
load an oft-needed constant + bz .align_dst_only + btst 3, %i1 ! is destnation address aligned? + clr %i4 ! clear registers used in either case + bz %icc, .align_src_only + clr %l0 + ! + ! both source and destination addresses are unaligned + ! +1: ! align source + ldub [%i0], %i3 ! read a byte from source address + add %i0, 1, %i0 ! increment source address + or %i4, %i3, %i4 ! or in with previous bytes (if any) + btst 3, %i0 ! is source aligned? + add %l0, 8, %l0 ! increment size of upper source (US) + bnz,a 1b + sll %i4, 8, %i4 ! make room for next byte + + sub %l5, %l0, %l1 ! generate shift left count (LS) + sll %i4, %l1, %i4 ! prepare to get rest + ld [%i0], %i3 ! read a word + add %i0, 4, %i0 ! increment source address + srl %i3, %l0, %i5 ! upper src bits into lower dst bits + or %i4, %i5, %i5 ! merge + mov 24, %l3 ! align destination +1: + srl %i5, %l3, %i4 ! prepare to write a single byte + stb %i4, [%i1] ! write a byte + add %i1, 1, %i1 ! increment destination address + sub %i2, 1, %i2 ! decrement count + btst 3, %i1 ! is destination aligned? + bnz,a 1b + sub %l3, 8, %l3 ! delay slot, decrement shift count (LD) + sub %l5, %l3, %l2 ! generate shift left count (UD) + sll %i5, %l2, %i5 ! move leftover into upper bytes + cmp %l2, %l0 ! cmp # reqd to fill dst w old src left + bgu %ncc, .more_needed ! need more to fill than we have + nop + + sll %i3, %l1, %i3 ! clear upper used byte(s) + srl %i3, %l1, %i3 + ! get the odd bytes between alignments + sub %l0, %l2, %l0 ! regenerate shift count + sub %l5, %l0, %l1 ! generate new shift left count (LS) + and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0 + andn %i2, 3, %i2 ! # of aligned bytes that can be moved + srl %i3, %l0, %i4 + or %i5, %i4, %i5 + st %i5, [%i1] ! write a word + subcc %i2, 4, %i2 ! decrement count + bz %ncc, .unalign_out + add %i1, 4, %i1 ! increment destination address + + b 2f + sll %i3, %l1, %i5 ! get leftover into upper bits +.more_needed: + sll %i3, %l0, %i3 ! save remaining byte(s) + srl %i3, %l0, %i3 + sub %l2, %l0, %l1 ! regenerate shift count + sub %l5, %l1, %l0 ! generate new shift left count + sll %i3, %l1, %i4 ! move to fill empty space + b 3f + or %i5, %i4, %i5 ! merge to complete word + ! + ! the source address is aligned and destination is not + ! +.align_dst_only: + ld [%i0], %i4 ! read a word + add %i0, 4, %i0 ! increment source address + mov 24, %l0 ! initial shift alignment count +1: + srl %i4, %l0, %i3 ! prepare to write a single byte + stb %i3, [%i1] ! write a byte + add %i1, 1, %i1 ! increment destination address + sub %i2, 1, %i2 ! decrement count + btst 3, %i1 ! is destination aligned? + bnz,a 1b + sub %l0, 8, %l0 ! delay slot, decrement shift count +.xfer: + sub %l5, %l0, %l1 ! generate shift left count + sll %i4, %l1, %i5 ! get leftover +3: + and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0 + andn %i2, 3, %i2 ! # of aligned bytes that can be moved +2: + ld [%i0], %i3 ! read a source word + add %i0, 4, %i0 ! increment source address + srl %i3, %l0, %i4 ! upper src bits into lower dst bits + or %i5, %i4, %i5 ! merge with upper dest bits (leftover) + st %i5, [%i1] ! write a destination word + subcc %i2, 4, %i2 ! decrement count + bz %ncc, .unalign_out ! check if done + add %i1, 4, %i1 ! increment destination address + b 2b ! loop + sll %i3, %l1, %i5 ! get leftover +.unalign_out: + tst %l4 ! any bytes leftover? + bz %ncc, .cpdone + .empty ! allow next instruction in delay slot +1: + sub %l0, 8, %l0 ! decrement shift + srl %i3, %l0, %i4 ! 
upper src byte into lower dst byte + stb %i4, [%i1] ! write a byte + subcc %l4, 1, %l4 ! decrement count + bz %ncc, .cpdone ! done? + add %i1, 1, %i1 ! increment destination + tst %l0 ! any more previously read bytes + bnz %ncc, 1b ! we have leftover bytes + mov %l4, %i2 ! delay slot, mv cnt where dbytecp wants + b .dbytecp ! let dbytecp do the rest + sub %i0, %i1, %i0 ! i0 gets the difference of src and dst + ! + ! the destination address is aligned and the source is not + ! +.align_src_only: + ldub [%i0], %i3 ! read a byte from source address + add %i0, 1, %i0 ! increment source address + or %i4, %i3, %i4 ! or in with previous bytes (if any) + btst 3, %i0 ! is source aligned? + add %l0, 8, %l0 ! increment shift count (US) + bnz,a .align_src_only + sll %i4, 8, %i4 ! make room for next byte + b,a .xfer + ! + ! if from address unaligned for double-word moves, + ! move bytes till it is, if count is < 56 it could take + ! longer to align the thing than to do the transfer + ! in word size chunks right away + ! +.aldoubcp: + cmp %i2, 56 ! if count < 56, use wordcp, it takes + blu,a %ncc, .alwordcp ! longer to align doubles than words + mov 3, %o0 ! mask for word alignment + call .alignit ! copy bytes until aligned + mov 7, %o0 ! mask for double alignment + ! + ! source and destination are now double-word aligned + ! i3 has aligned count returned by alignit + ! + and %i2, 7, %i2 ! unaligned leftover count + sub %i0, %i1, %i0 ! i0 gets the difference of src and dst +5: + ldx [%i0+%i1], %o4 ! read from address + stx %o4, [%i1] ! write at destination address + subcc %i3, 8, %i3 ! dec count + bgu %ncc, 5b + add %i1, 8, %i1 ! delay slot, inc to address + cmp %i2, 4 ! see if we can copy a word + blu %ncc, .dbytecp ! if 3 or less bytes use bytecp + .empty + ! + ! for leftover bytes we fall into wordcp, if needed + ! +.wordcp: + and %i2, 3, %i2 ! unaligned leftover count +5: + ld [%i0+%i1], %o4 ! read from address + st %o4, [%i1] ! write at destination address + subcc %i3, 4, %i3 ! dec count + bgu %ncc, 5b + add %i1, 4, %i1 ! delay slot, inc to address + b,a .dbytecp + + ! we come here to align copies on word boundaries +.alwordcp: + call .alignit ! go word-align it + mov 3, %o0 ! bits that must be zero to be aligned + b .wordcp + sub %i0, %i1, %i0 ! i0 gets the difference of src and dst + + ! + ! byte copy, works with any alignment + ! +.bytecp: + b .dbytecp + sub %i0, %i1, %i0 ! i0 gets difference of src and dst + + ! + ! differenced byte copy, works with any alignment + ! assumes dest in %i1 and (source - dest) in %i0 + ! +1: + stb %o4, [%i1] ! write to address + inc %i1 ! inc to address +.dbytecp: + deccc %i2 ! dec count + bgeu,a %ncc, 1b ! loop till done + ldub [%i0+%i1], %o4 ! read from address + ! + ! FPUSED_FLAG will not have been set in any path leading to + ! this point. No need to deal with it. + ! +.cpdone: + btst BCOPY_FLAG, %l6 + bz,pn %icc, 2f + andncc %l6, BCOPY_FLAG, %l6 + ! + ! Here via bcopy. Check to see if the handler was NULL. + ! If so, just return quietly. Otherwise, reset the + ! handler and go home. + ! + bnz,pn %ncc, 2f + nop + ! + ! Null handler. + ! + ret + restore %g0, 0, %o0 + ! + ! Here via kcopy or bcopy with a handler.Reset the + ! fault handler. + ! +2: + membar #Sync + stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + ret + restore %g0, 0, %o0 ! return (0) + +/* + * Common code used to align transfers on word and doubleword + * boudaries. Aligns source and destination and returns a count + * of aligned bytes to transfer in %i3 + */ +1: + inc %i0 ! 
inc from + stb %o4, [%i1] ! write a byte + inc %i1 ! inc to + dec %i2 ! dec count +.alignit: + btst %o0, %i0 ! %o0 is bit mask to check for alignment + bnz,a 1b + ldub [%i0], %o4 ! read next byte + + retl + andn %i2, %o0, %i3 ! return size of aligned bytes + SET_SIZE(bcopy) + +#endif /* lint */ + +/* + * Block copy with possibly overlapped operands. + */ + +#if defined(lint) + +/*ARGSUSED*/ +void +ovbcopy(const void *from, void *to, size_t count) +{} + +#else /* lint */ + + ENTRY(ovbcopy) + tst %o2 ! check count + bgu,a %ncc, 1f ! nothing to do or bad arguments + subcc %o0, %o1, %o3 ! difference of from and to address + + retl ! return + nop +1: + bneg,a %ncc, 2f + neg %o3 ! if < 0, make it positive +2: cmp %o2, %o3 ! cmp size and abs(from - to) + bleu %ncc, bcopy ! if size <= abs(diff): use bcopy, + .empty ! no overlap + cmp %o0, %o1 ! compare from and to addresses + blu %ncc, .ov_bkwd ! if from < to, copy backwards + nop + ! + ! Copy forwards. + ! +.ov_fwd: + ldub [%o0], %o3 ! read from address + inc %o0 ! inc from address + stb %o3, [%o1] ! write to address + deccc %o2 ! dec count + bgu %ncc, .ov_fwd ! loop till done + inc %o1 ! inc to address + + retl ! return + nop + ! + ! Copy backwards. + ! +.ov_bkwd: + deccc %o2 ! dec count + ldub [%o0 + %o2], %o3 ! get byte at end of src + bgu %ncc, .ov_bkwd ! loop till done + stb %o3, [%o1 + %o2] ! delay slot, store at end of dst + + retl ! return + nop + SET_SIZE(ovbcopy) + +#endif /* lint */ + +/* + * hwblkpagecopy() + * + * Copies exactly one page. This routine assumes the caller (ppcopy) + * has already disabled kernel preemption and has checked + * use_hw_bcopy. + */ +#ifdef lint +/*ARGSUSED*/ +void +hwblkpagecopy(const void *src, void *dst) +{ } +#else /* lint */ + ENTRY(hwblkpagecopy) + ! get another window w/space for three aligned blocks of saved fpregs + save %sp, -SA(MINFRAME + 4*64), %sp + + ! %i0 - source address (arg) + ! %i1 - destination address (arg) + ! %i2 - length of region (not arg) + ! %l0 - saved fprs + ! %l1 - pointer to saved fpregs + + rd %fprs, %l0 ! check for unused fp + btst FPRS_FEF, %l0 + bz 1f + membar #Sync + + ! save in-use fpregs on stack + add %fp, STACK_BIAS - 193, %l1 + and %l1, -64, %l1 + stda %d0, [%l1]ASI_BLK_P + add %l1, 64, %l3 + stda %d16, [%l3]ASI_BLK_P + add %l3, 64, %l3 + stda %d32, [%l3]ASI_BLK_P + membar #Sync + +1: wr %g0, FPRS_FEF, %fprs + ldda [%i0]ASI_BLK_P, %d0 + add %i0, 64, %i0 + set PAGESIZE - 64, %i2 + +2: ldda [%i0]ASI_BLK_P, %d16 + fsrc1 %d0, %d32 + fsrc1 %d2, %d34 + fsrc1 %d4, %d36 + fsrc1 %d6, %d38 + fsrc1 %d8, %d40 + fsrc1 %d10, %d42 + fsrc1 %d12, %d44 + fsrc1 %d14, %d46 + stda %d32, [%i1]ASI_BLK_P + add %i0, 64, %i0 + subcc %i2, 64, %i2 + bz,pn %ncc, 3f + add %i1, 64, %i1 + ldda [%i0]ASI_BLK_P, %d0 + fsrc1 %d16, %d32 + fsrc1 %d18, %d34 + fsrc1 %d20, %d36 + fsrc1 %d22, %d38 + fsrc1 %d24, %d40 + fsrc1 %d26, %d42 + fsrc1 %d28, %d44 + fsrc1 %d30, %d46 + stda %d32, [%i1]ASI_BLK_P + add %i0, 64, %i0 + sub %i2, 64, %i2 + ba,pt %ncc, 2b + add %i1, 64, %i1 + +3: membar #Sync + btst FPRS_FEF, %l0 + bz 4f + stda %d16, [%i1]ASI_BLK_P + + ! restore fpregs from stack + membar #Sync + ldda [%l1]ASI_BLK_P, %d0 + add %l1, 64, %l3 + ldda [%l3]ASI_BLK_P, %d16 + add %l3, 64, %l3 + ldda [%l3]ASI_BLK_P, %d32 + +4: wr %l0, 0, %fprs ! 
restore fprs + membar #Sync + ret + restore %g0, 0, %o0 + SET_SIZE(hwblkpagecopy) +#endif /* lint */ + + +/* + * Transfer data to and from user space - + * Note that these routines can cause faults + * It is assumed that the kernel has nothing at + * less than KERNELBASE in the virtual address space. + * + * Note that copyin(9F) and copyout(9F) are part of the + * DDI/DKI which specifies that they return '-1' on "errors." + * + * Sigh. + * + * So there's two extremely similar routines - xcopyin() and xcopyout() + * which return the errno that we've faithfully computed. This + * allows other callers (e.g. uiomove(9F)) to work correctly. + * Given that these are used pretty heavily, we expand the calling + * sequences inline for all flavours (rather than making wrappers). + * + * There are also stub routines for xcopyout_little and xcopyin_little, + * which currently are intended to handle requests of <= 16 bytes from + * do_unaligned. Future enhancement to make them handle 8k pages efficiently + * is left as an exercise... + */ + +/* + * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) + * + * General theory of operation: + * + * The only difference between default_copy{in,out} and + * default_xcopy{in,out} is in the error handling routine they invoke + * when a memory access error is seen. default_xcopyOP returns the errno + * while default_copyOP returns -1 (see above). copy{in,out}_noerr set + * a special flag (by oring the value 2 into the fault handler address) + * if they are called with a fault handler already in place. That flag + * causes the default handlers to trampoline to the previous handler + * upon an error. + * + * None of the copyops routines grab a window until it's decided that + * we need to do a HW block copy operation. This saves a window + * spill/fill when we're called during socket ops. The typical IO + * path won't cause spill/fill traps. + * + * This code uses a set of 4 limits for the maximum size that will + * be copied given a particular input/output address alignment. + * the default limits are: + * + * single byte aligned - 900 (hw_copy_limit_1) + * two byte aligned - 1800 (hw_copy_limit_2) + * four byte aligned - 3600 (hw_copy_limit_4) + * eight byte aligned - 7200 (hw_copy_limit_8) + * + * If the value for a particular limit is zero, the copy will be done + * via the copy loops rather than VIS. + * + * Flow: + * + * If count == zero return zero. + * + * Store the previous lo_fault handler into %g6. + * Place our secondary lofault handler into %g5. + * Place the address of our nowindow fault handler into %o3. + * Place the address of the windowed fault handler into %o4. + * --> We'll use this handler if we end up grabbing a window + * --> before we use VIS instructions. + * + * If count is less than or equal to SMALL_LIMIT (7) we + * always do a byte for byte copy. + * + * If count is > SMALL_LIMIT, we check the alignment of the input + * and output pointers. Based on the alignment we check count + * against a soft limit of VIS_COPY_THRESHOLD (900 on spitfire). If + * we're larger than VIS_COPY_THRESHOLD, we check against a limit based + * on detected alignment. If we exceed the alignment value we copy + * via VIS instructions. + * + * If we don't exceed one of the limits, we store -count in %o3, + * we store the number of chunks (8, 4, 2 or 1 byte) operated + * on in our basic copy loop in %o2. Following this we branch + * to the appropriate copy loop and copy that many chunks. 
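+ *
+ * In C terms, the basic copy loops set up this way look roughly like the
+ * following sketch (src, dst, count and off stand for the values kept in
+ * %o0, %o1, %o2 and %o3, chunksize is 8, 4, 2 or 1, and the ASI_USER
+ * stores are ignored):
+ *
+ *	src += count;			! point both buffers at their ends
+ *	dst += count;
+ *	for (off = -(ssize_t)count; off < 0; off += chunksize)
+ *		*(dst + off) = *(src + off);
+ *
+ * so each iteration advances only the single offset register rather than
+ * both buffer pointers.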
+ * Since we've been adding the chunk size to %o3 each time through + * as well as decrementing %o2, we can tell if any data is + * is left to be copied by examining %o3. If that is zero, we're + * done and can go home. If not, we figure out what the largest + * chunk size left to be copied is and branch to that copy loop + * unless there's only one byte left. We load that as we're + * branching to code that stores it just before we return. + * + * There is one potential situation in which we start to do a VIS + * copy but decide to punt and return to the copy loops. There is + * (in the default configuration) a window of 256 bytes between + * the single byte aligned copy limit and what VIS treats as its + * minimum if floating point is in use in the calling app. We need + * to be prepared to handle this. See the .small_copyOP label for + * details. + * + * Fault handlers are invoked if we reference memory that has no + * current mapping. All forms share the same copyio_fault handler. + * This routine handles fixing up the stack and general housecleaning. + * Each copy operation has a simple fault handler that is then called + * to do the work specific to the invidual operation. The handlers + * for default_copyOP and copyOP_noerr are found at the end of + * default_copyout. The handlers for default_xcopyOP are found at the + * end of xdefault_copyin. + */ + +/* + * Copy kernel data to user space (copyout/xcopyout/xcopyout_little). + */ + +#if defined(lint) + +/*ARGSUSED*/ +int +copyout(const void *kaddr, void *uaddr, size_t count) +{ return (0); } + +#else /* lint */ + +/* + * We save the arguments in the following registers in case of a fault: + * kaddr - %g2 + * uaddr - %g3 + * count - %g4 + */ +#define SAVE_SRC %g2 +#define SAVE_DST %g3 +#define SAVE_COUNT %g4 + +#define REAL_LOFAULT %g5 +#define SAVED_LOFAULT %g6 + +/* + * Generic copyio fault handler. This is the first line of defense when a + * fault occurs in (x)copyin/(x)copyout. In order for this to function + * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT. + * This allows us to share common code for all the flavors of the copy + * operations, including the _noerr versions. + * + * Note that this function will restore the original input parameters before + * calling REAL_LOFAULT. So the real handler can vector to the appropriate + * member of the t_copyop structure, if needed. + */ + ENTRY(copyio_fault) + btst FPUSED_FLAG, SAVED_LOFAULT + bz 1f + andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT + + membar #Sync + + ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 + wr %o2, 0, %gsr ! restore gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz 4f + nop + + ! restore fpregs from stack + membar #Sync + add %fp, STACK_BIAS - 257, %o2 + and %o2, -64, %o2 + ldda [%o2]ASI_BLK_P, %d0 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d16 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d32 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d48 + membar #Sync + + ba,pt %ncc, 1f + wr %o3, 0, %fprs ! restore fprs + +4: + FZERO ! zero all of the fpregs + wr %o3, 0, %fprs ! restore fprs + +1: + + restore + + mov SAVE_SRC, %o0 + mov SAVE_DST, %o1 + jmp REAL_LOFAULT + mov SAVE_COUNT, %o2 + SET_SIZE(copyio_fault) + + ENTRY(copyio_fault_nowindow) + membar #Sync + stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault + + mov SAVE_SRC, %o0 + mov SAVE_DST, %o1 + jmp REAL_LOFAULT + mov SAVE_COUNT, %o2 + SET_SIZE(copyio_fault_nowindow) + + ENTRY(copyout) + sethi %hi(.copyout_err), REAL_LOFAULT + or REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT + +.do_copyout: + ! + ! Check the length and bail if zero. + ! + tst %o2 + bnz,pt %ncc, 1f + nop + retl + clr %o0 +1: + sethi %hi(copyio_fault), %o4 + or %o4, %lo(copyio_fault), %o4 + sethi %hi(copyio_fault_nowindow), %o3 + ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT + or %o3, %lo(copyio_fault_nowindow), %o3 + membar #Sync + stn %o3, [THREAD_REG + T_LOFAULT] + + mov %o0, SAVE_SRC + mov %o1, SAVE_DST + mov %o2, SAVE_COUNT + + ! + ! Check to see if we're more than SMALL_LIMIT (7 bytes). + ! Run in leaf mode, using the %o regs as our input regs. + ! + subcc %o2, SMALL_LIMIT, %o3 + bgu,a,pt %ncc, .dco_ns + or %o0, %o1, %o3 + ! + ! What was previously ".small_copyout" + ! Do full differenced copy. + ! +.dcobcp: + sub %g0, %o2, %o3 ! negate count + add %o0, %o2, %o0 ! make %o0 point at the end + add %o1, %o2, %o1 ! make %o1 point at the end + ba,pt %ncc, .dcocl + ldub [%o0 + %o3], %o4 ! load first byte + ! + ! %o0 and %o2 point at the end and remain pointing at the end + ! of their buffers. We pull things out by adding %o3 (which is + ! the negation of the length) to the buffer end which gives us + ! the curent location in the buffers. By incrementing %o3 we walk + ! through both buffers without having to bump each buffer's + ! pointer. A very fast 4 instruction loop. + ! + .align 16 +.dcocl: + stba %o4, [%o1 + %o3]ASI_USER + inccc %o3 + bl,a,pt %ncc, .dcocl + ldub [%o0 + %o3], %o4 + ! + ! We're done. Go home. + ! + membar #Sync + stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] + retl + clr %o0 + ! + ! Try aligned copies from here. + ! +.dco_ns: + ! %o0 = kernel addr (to be copied from) + ! %o1 = user addr (to be copied to) + ! %o2 = length + ! %o3 = %o1 | %o2 (used for alignment checking) + ! %o4 is alternate lo_fault + ! %o5 is original lo_fault + ! + ! See if we're single byte aligned. If we are, check the + ! limit for single byte copies. If we're smaller or equal, + ! bounce to the byte for byte copy loop. Otherwise do it in + ! HW (if enabled). + ! + btst 1, %o3 + bz,pt %icc, .dcoh8 + btst 7, %o3 + ! + ! Single byte aligned. Do we do it via HW or via + ! byte for byte? Do a quick no memory reference + ! check to pick up small copies. + ! + subcc %o2, VIS_COPY_THRESHOLD, %o3 + bleu,pt %ncc, .dcobcp + sethi %hi(hw_copy_limit_1), %o3 + ! + ! Big enough that we need to check the HW limit for + ! this size copy. + ! + ld [%o3 + %lo(hw_copy_limit_1)], %o3 + ! + ! Is HW copy on? If not, do everything byte for byte. + ! + tst %o3 + bz,pn %icc, .dcobcp + subcc %o3, %o2, %o3 + ! + ! If we're less than or equal to the single byte copy limit, + ! bop to the copy loop. + ! + bge,pt %ncc, .dcobcp + nop + ! + ! We're big enough and copy is on. Do it with HW. + ! + ba,pt %ncc, .big_copyout + nop +.dcoh8: + ! + ! 8 byte aligned? + ! + bnz,a %ncc, .dcoh4 + btst 3, %o3 + ! + ! See if we're in the "small range". + ! If so, go off and do the copy. + ! If not, load the hard limit. %o3 is + ! available for reuse. + ! + subcc %o2, VIS_COPY_THRESHOLD, %o3 + bleu,pt %ncc, .dcos8 + sethi %hi(hw_copy_limit_8), %o3 + ld [%o3 + %lo(hw_copy_limit_8)], %o3 + ! + ! If it's zero, there's no HW bcopy. + ! Bop off to the aligned copy. + ! + tst %o3 + bz,pn %icc, .dcos8 + subcc %o3, %o2, %o3 + ! + ! We're negative if our size is larger than hw_copy_limit_8. + ! 
+ bge,pt %ncc, .dcos8 + nop + ! + ! HW assist is on and we're large enough. Do it. + ! + ba,pt %ncc, .big_copyout + nop +.dcos8: + ! + ! Housekeeping for copy loops. Uses same idea as in the byte for + ! byte copy loop above. + ! + add %o0, %o2, %o0 + add %o1, %o2, %o1 + sub %g0, %o2, %o3 + ba,pt %ncc, .dodebc + srl %o2, 3, %o2 ! Number of 8 byte chunks to copy + ! + ! 4 byte aligned? + ! +.dcoh4: + bnz,pn %ncc, .dcoh2 + ! + ! See if we're in the "small range". + ! If so, go off an do the copy. + ! If not, load the hard limit. %o3 is + ! available for reuse. + ! + subcc %o2, VIS_COPY_THRESHOLD, %o3 + bleu,pt %ncc, .dcos4 + sethi %hi(hw_copy_limit_4), %o3 + ld [%o3 + %lo(hw_copy_limit_4)], %o3 + ! + ! If it's zero, there's no HW bcopy. + ! Bop off to the aligned copy. + ! + tst %o3 + bz,pn %icc, .dcos4 + subcc %o3, %o2, %o3 + ! + ! We're negative if our size is larger than hw_copy_limit_4. + ! + bge,pt %ncc, .dcos4 + nop + ! + ! HW assist is on and we're large enough. Do it. + ! + ba,pt %ncc, .big_copyout + nop +.dcos4: + add %o0, %o2, %o0 + add %o1, %o2, %o1 + sub %g0, %o2, %o3 + ba,pt %ncc, .dodfbc + srl %o2, 2, %o2 ! Number of 4 byte chunks to copy + ! + ! We must be 2 byte aligned. Off we go. + ! The check for small copies was done in the + ! delay at .dcoh4 + ! +.dcoh2: + ble %ncc, .dcos2 + sethi %hi(hw_copy_limit_2), %o3 + ld [%o3 + %lo(hw_copy_limit_2)], %o3 + tst %o3 + bz,pn %icc, .dcos2 + subcc %o3, %o2, %o3 + bge,pt %ncc, .dcos2 + nop + ! + ! HW is on and we're big enough. Do it. + ! + ba,pt %ncc, .big_copyout + nop +.dcos2: + add %o0, %o2, %o0 + add %o1, %o2, %o1 + sub %g0, %o2, %o3 + ba,pt %ncc, .dodtbc + srl %o2, 1, %o2 ! Number of 2 byte chunks to copy +.small_copyout: + ! + ! Why are we doing this AGAIN? There are certain conditions in + ! big_copyout that will cause us to forego the HW assisted copies + ! and bounce back to a non-HW assisted copy. This dispatches those + ! copies. Note that we branch around this in the main line code. + ! + ! We make no check for limits or HW enablement here. We've + ! already been told that we're a poster child so just go off + ! and do it. + ! + or %o0, %o1, %o3 + btst 1, %o3 + bnz %icc, .dcobcp ! Most likely + btst 7, %o3 + bz %icc, .dcos8 + btst 3, %o3 + bz %icc, .dcos4 + nop + ba,pt %ncc, .dcos2 + nop + .align 32 +.dodebc: + ldx [%o0 + %o3], %o4 + deccc %o2 + stxa %o4, [%o1 + %o3]ASI_USER + bg,pt %ncc, .dodebc + addcc %o3, 8, %o3 + ! + ! End of copy loop. Check to see if we're done. Most + ! eight byte aligned copies end here. + ! + bz,pt %ncc, .dcofh + nop + ! + ! Something is left - do it byte for byte. + ! + ba,pt %ncc, .dcocl + ldub [%o0 + %o3], %o4 ! load next byte + ! + ! Four byte copy loop. %o2 is the number of 4 byte chunks to copy. + ! + .align 32 +.dodfbc: + lduw [%o0 + %o3], %o4 + deccc %o2 + sta %o4, [%o1 + %o3]ASI_USER + bg,pt %ncc, .dodfbc + addcc %o3, 4, %o3 + ! + ! End of copy loop. Check to see if we're done. Most + ! four byte aligned copies end here. + ! + bz,pt %ncc, .dcofh + nop + ! + ! Something is left. Do it byte for byte. + ! + ba,pt %ncc, .dcocl + ldub [%o0 + %o3], %o4 ! load next byte + ! + ! two byte aligned copy loop. %o2 is the number of 2 byte chunks to + ! copy. + ! + .align 32 +.dodtbc: + lduh [%o0 + %o3], %o4 + deccc %o2 + stha %o4, [%o1 + %o3]ASI_USER + bg,pt %ncc, .dodtbc + addcc %o3, 2, %o3 + ! + ! End of copy loop. Anything left? + ! + bz,pt %ncc, .dcofh + nop + ! + ! Deal with the last byte + ! 
+ ldub [%o0 + %o3], %o4 + stba %o4, [%o1 + %o3]ASI_USER +.dcofh: + membar #Sync + stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + clr %o0 + +.big_copyout: + ! + ! Are we using the FP registers? + ! + rd %fprs, %o3 ! check for unused fp + btst FPRS_FEF, %o3 + bnz %icc, .copyout_fpregs_inuse + nop + ! + ! We're going to go off and do a block copy. + ! Switch fault hendlers and grab a window. We + ! don't do a membar #Sync since we've done only + ! kernel data to this point. + ! + stn %o4, [THREAD_REG + T_LOFAULT] + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + ! + ! %o3 is now %i3. Save original %fprs. + ! + st %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] + ba,pt %ncc, .do_block_copyout ! Not in use. Go off and do it. + wr %g0, FPRS_FEF, %fprs ! clear %fprs + ! +.copyout_fpregs_inuse: + ! + ! We're here if the FP regs are in use. Need to see if the request + ! exceeds our suddenly larger minimum. + ! + cmp %i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger + bl %ncc, .small_copyout + nop + ! + ! We're going to go off and do a block copy. + ! Change to the heavy duty fault handler and grab a window first. + ! + stn %o4, [THREAD_REG + T_LOFAULT] + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + st %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] + ! + ! save in-use fpregs on stack + ! + wr %g0, FPRS_FEF, %fprs + membar #Sync + add %fp, STACK_BIAS - 257, %o2 + and %o2, -64, %o2 + stda %d0, [%o2]ASI_BLK_P + add %o2, 64, %o2 + stda %d16, [%o2]ASI_BLK_P + add %o2, 64, %o2 + stda %d32, [%o2]ASI_BLK_P + add %o2, 64, %o2 + stda %d48, [%o2]ASI_BLK_P + membar #Sync + +.do_block_copyout: + membar #StoreStore|#StoreLoad|#LoadStore + + rd %gsr, %o2 + st %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr + + ! Set the lower bit in the saved t_lofault to indicate + ! that we need to clear the %fprs register on the way + ! out + or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT + + ! Swap src/dst since the code below is memcpy code + ! and memcpy/bcopy have different calling sequences + mov %i1, %i5 + mov %i0, %i1 + mov %i5, %i0 + +!!! This code is nearly identical to the version in the sun4u +!!! libc_psr. Most bugfixes made to that file should be +!!! merged into this routine. + + andcc %i0, 7, %o3 + bz %ncc, copyout_blkcpy + sub %o3, 8, %o3 + neg %o3 + sub %i2, %o3, %i2 + + ! Align Destination on double-word boundary + +2: ldub [%i1], %o4 + inc %i1 + stba %o4, [%i0]ASI_USER + deccc %o3 + bgu %ncc, 2b + inc %i0 +copyout_blkcpy: + andcc %i0, 63, %i3 + bz,pn %ncc, copyout_blalign ! now block aligned + sub %i3, 64, %i3 + neg %i3 ! bytes till block aligned + sub %i2, %i3, %i2 ! update %i2 with new count + + ! Copy %i3 bytes till dst is block (64 byte) aligned. use + ! double word copies. + + alignaddr %i1, %g0, %g1 + ldd [%g1], %d0 + add %g1, 8, %g1 +6: + ldd [%g1], %d2 + add %g1, 8, %g1 + subcc %i3, 8, %i3 + faligndata %d0, %d2, %d8 + stda %d8, [%i0]ASI_USER + add %i1, 8, %i1 + bz,pn %ncc, copyout_blalign + add %i0, 8, %i0 + ldd [%g1], %d0 + add %g1, 8, %g1 + subcc %i3, 8, %i3 + faligndata %d2, %d0, %d8 + stda %d8, [%i0]ASI_USER + add %i1, 8, %i1 + bgu,pn %ncc, 6b + add %i0, 8, %i0 + +copyout_blalign: + membar #StoreLoad + ! %i2 = total length + ! %i3 = blocks (length - 64) / 64 + ! %i4 = doubles remaining (length - blocks) + sub %i2, 64, %i3 + andn %i3, 63, %i3 + sub %i2, %i3, %i4 + andn %i4, 7, %i4 + sub %i4, 16, %i4 + sub %i2, %i4, %i2 + sub %i2, %i3, %i2 + + andn %i1, 0x3f, %l7 ! blk aligned address + alignaddr %i1, %g0, %g0 ! gen %gsr + + srl %i1, 3, %l5 ! 
bits 3,4,5 are now least sig in %l5 + andcc %l5, 7, %i5 ! mask everything except bits 1,2 3 + add %i1, %i4, %i1 + add %i1, %i3, %i1 + + ldda [%l7]ASI_BLK_P, %d0 + add %l7, 64, %l7 + ldda [%l7]ASI_BLK_P, %d16 + add %l7, 64, %l7 + ldda [%l7]ASI_BLK_P, %d32 + add %l7, 64, %l7 + sub %i3, 128, %i3 + + ! switch statement to get us to the right 8 byte blk within a + ! 64 byte block + + cmp %i5, 4 + bgeu,a copyout_hlf + cmp %i5, 6 + cmp %i5, 2 + bgeu,a copyout_sqtr + nop + cmp %i5, 1 + be,a copyout_seg1 + nop + ba,pt %ncc, copyout_seg0 + nop +copyout_sqtr: + be,a copyout_seg2 + nop + ba,pt %ncc, copyout_seg3 + nop + +copyout_hlf: + bgeu,a copyout_fqtr + nop + cmp %i5, 5 + be,a copyout_seg5 + nop + ba,pt %ncc, copyout_seg4 + nop +copyout_fqtr: + be,a copyout_seg6 + nop + ba,pt %ncc, copyout_seg7 + nop + +copyout_seg0: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D0 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D16 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D32 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyout_seg0 + +0: + FALIGN_D16 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D32 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd0 + add %i0, 64, %i0 + +1: + FALIGN_D32 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D0 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd16 + add %i0, 64, %i0 + +2: + FALIGN_D0 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D16 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd32 + add %i0, 64, %i0 + +copyout_seg1: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D2 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D18 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D34 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyout_seg1 +0: + FALIGN_D18 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D34 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd2 + add %i0, 64, %i0 + +1: + FALIGN_D34 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D2 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd18 + add %i0, 64, %i0 + +2: + FALIGN_D2 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D18 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd34 + add %i0, 64, %i0 + +copyout_seg2: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D4 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 
2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D20 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D36 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyout_seg2 + +0: + FALIGN_D20 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D36 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd4 + add %i0, 64, %i0 + +1: + FALIGN_D36 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D4 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd20 + add %i0, 64, %i0 + +2: + FALIGN_D4 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D20 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd36 + add %i0, 64, %i0 + +copyout_seg3: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D6 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D22 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D38 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyout_seg3 + +0: + FALIGN_D22 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D38 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd6 + add %i0, 64, %i0 + +1: + FALIGN_D38 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D6 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd22 + add %i0, 64, %i0 + +2: + FALIGN_D6 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D22 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd38 + add %i0, 64, %i0 + +copyout_seg4: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D8 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D24 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D40 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyout_seg4 + +0: + FALIGN_D24 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D40 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd8 + add %i0, 64, %i0 + +1: + FALIGN_D40 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D8 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd24 + add %i0, 64, %i0 + +2: + FALIGN_D8 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D24 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd40 + add %i0, 64, %i0 + +copyout_seg5: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D10 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 
2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D26 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D42 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyout_seg5 + +0: + FALIGN_D26 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D42 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd10 + add %i0, 64, %i0 + +1: + FALIGN_D42 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D10 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd26 + add %i0, 64, %i0 + +2: + FALIGN_D10 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D26 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd42 + add %i0, 64, %i0 + +copyout_seg6: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D12 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D28 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D44 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyout_seg6 + +0: + FALIGN_D28 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D44 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd12 + add %i0, 64, %i0 + +1: + FALIGN_D44 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D12 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd28 + add %i0, 64, %i0 + +2: + FALIGN_D12 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D28 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd44 + add %i0, 64, %i0 + +copyout_seg7: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D14 + ldda [%l7]ASI_BLK_P, %d0 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D30 + ldda [%l7]ASI_BLK_P, %d16 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D46 + ldda [%l7]ASI_BLK_P, %d32 + stda %d48, [%i0]ASI_BLK_AIUS + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyout_seg7 + +0: + FALIGN_D30 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D46 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd14 + add %i0, 64, %i0 + +1: + FALIGN_D46 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D14 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd30 + add %i0, 64, %i0 + +2: + FALIGN_D14 + stda %d48, [%i0]ASI_BLK_AIUS + add %i0, 64, %i0 + membar #Sync + FALIGN_D30 + stda %d48, [%i0]ASI_BLK_AIUS + ba,pt %ncc, copyout_blkd46 + add %i0, 64, %i0 + + + ! + ! dribble out the last partial block + ! 
+copyout_blkd0: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d0, %d2, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd2: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d2, %d4, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd4: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d4, %d6, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd6: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d6, %d8, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd8: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d8, %d10, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd10: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d10, %d12, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd12: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d12, %d14, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd14: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + fsrc1 %d14, %d0 + ba,a,pt %ncc, copyout_blkleft + +copyout_blkd16: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d16, %d18, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd18: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d18, %d20, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd20: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d20, %d22, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd22: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d22, %d24, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd24: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d24, %d26, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd26: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d26, %d28, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd28: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d28, %d30, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd30: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + fsrc1 %d30, %d0 + ba,a,pt %ncc, copyout_blkleft +copyout_blkd32: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d32, %d34, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd34: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d34, %d36, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd36: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d36, %d38, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd38: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d38, %d40, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd40: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d40, %d42, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd42: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d42, %d44, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd44: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + faligndata %d44, %d46, %d48 + stda %d48, [%i0]ASI_USER + add %i0, 8, %i0 +copyout_blkd46: + subcc %i4, 8, %i4 + blu,pn %ncc, copyout_blkdone + fsrc1 %d46, %d0 + +copyout_blkleft: +1: + ldd [%l7], %d2 + add %l7, 8, %l7 + subcc %i4, 8, %i4 + faligndata %d0, %d2, %d8 + stda %d8, [%i0]ASI_USER + blu,pn %ncc, copyout_blkdone + add %i0, 8, %i0 + ldd [%l7], %d0 + add %l7, 8, %l7 + subcc %i4, 8, %i4 + 
faligndata %d2, %d0, %d8 + stda %d8, [%i0]ASI_USER + bgeu,pt %ncc, 1b + add %i0, 8, %i0 + +copyout_blkdone: + tst %i2 + bz,pt %ncc, .copyout_exit + and %l3, 0x4, %l3 ! fprs.du = fprs.dl = 0 + +7: ldub [%i1], %i4 + inc %i1 + stba %i4, [%i0]ASI_USER + inc %i0 + deccc %i2 + bgu %ncc, 7b + nop + +.copyout_exit: + membar #StoreLoad|#StoreStore + btst FPUSED_FLAG, SAVED_LOFAULT + bz 1f + nop + + ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 + wr %o2, 0, %gsr ! restore gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz 4f + nop + + ! restore fpregs from stack + membar #Sync + add %fp, STACK_BIAS - 257, %o2 + and %o2, -64, %o2 + ldda [%o2]ASI_BLK_P, %d0 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d16 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d32 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d48 + membar #Sync + + ba,pt %ncc, 1f + wr %o3, 0, %fprs ! restore fprs + +4: + FZERO ! zero all of the fpregs + wr %o3, 0, %fprs ! restore fprs + +1: + andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT + membar #Sync ! sync error barrier + stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + ret + restore %g0, 0, %o0 + +.copyout_err: + ldn [THREAD_REG + T_COPYOPS], %o4 + brz %o4, 2f + nop + ldn [%o4 + CP_COPYOUT], %g2 + jmp %g2 + nop +2: + retl + mov -1, %o0 + SET_SIZE(copyout) + +#endif /* lint */ + + +#ifdef lint + +/*ARGSUSED*/ +int +xcopyout(const void *kaddr, void *uaddr, size_t count) +{ return (0); } + +#else /* lint */ + + ENTRY(xcopyout) + sethi %hi(.xcopyout_err), REAL_LOFAULT + b .do_copyout + or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT +.xcopyout_err: + ldn [THREAD_REG + T_COPYOPS], %o4 + brz %o4, 2f + nop + ldn [%o4 + CP_XCOPYOUT], %g2 + jmp %g2 + nop +2: + retl + mov %g1, %o0 + SET_SIZE(xcopyout) + +#endif /* lint */ + +#ifdef lint + +/*ARGSUSED*/ +int +xcopyout_little(const void *kaddr, void *uaddr, size_t count) +{ return (0); } + +#else /* lint */ + + ENTRY(xcopyout_little) + sethi %hi(.little_err), %o4 + ldn [THREAD_REG + T_LOFAULT], %o5 + or %o4, %lo(.little_err), %o4 + membar #Sync ! sync error barrier + stn %o4, [THREAD_REG + T_LOFAULT] + + subcc %g0, %o2, %o3 + add %o0, %o2, %o0 + bz,pn %ncc, 2f ! check for zero bytes + sub %o2, 1, %o4 + add %o0, %o4, %o0 ! start w/last byte + add %o1, %o2, %o1 + ldub [%o0+%o3], %o4 + +1: stba %o4, [%o1+%o3]ASI_AIUSL + inccc %o3 + sub %o0, 2, %o0 ! get next byte + bcc,a,pt %ncc, 1b + ldub [%o0+%o3], %o4 + +2: membar #Sync ! sync error barrier + stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g0, %o0 ! return (0) + SET_SIZE(xcopyout_little) + +#endif /* lint */ + +/* + * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) + */ + +#if defined(lint) + +/*ARGSUSED*/ +int +copyin(const void *uaddr, void *kaddr, size_t count) +{ return (0); } + +#else /* lint */ + + ENTRY(copyin) + sethi %hi(.copyin_err), REAL_LOFAULT + or REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT + +.do_copyin: + ! + ! Check the length and bail if zero. + ! + tst %o2 + bnz,pt %ncc, 1f + nop + retl + clr %o0 +1: + sethi %hi(copyio_fault), %o4 + or %o4, %lo(copyio_fault), %o4 + sethi %hi(copyio_fault_nowindow), %o3 + ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT + or %o3, %lo(copyio_fault_nowindow), %o3 + membar #Sync + stn %o3, [THREAD_REG + T_LOFAULT] + + mov %o0, SAVE_SRC + mov %o1, SAVE_DST + mov %o2, SAVE_COUNT + + ! + ! Check to see if we're more than SMALL_LIMIT. + ! + subcc %o2, SMALL_LIMIT, %o3 + bgu,a,pt %ncc, .dci_ns + or %o0, %o1, %o3 + ! + ! What was previously ".small_copyin" + ! 
+.dcibcp: + sub %g0, %o2, %o3 ! setup for copy loop + add %o0, %o2, %o0 + add %o1, %o2, %o1 + ba,pt %ncc, .dcicl + lduba [%o0 + %o3]ASI_USER, %o4 + ! + ! %o0 and %o1 point at the end and remain pointing at the end + ! of their buffers. We pull things out by adding %o3 (which is + ! the negation of the length) to the buffer end which gives us + ! the curent location in the buffers. By incrementing %o3 we walk + ! through both buffers without having to bump each buffer's + ! pointer. A very fast 4 instruction loop. + ! + .align 16 +.dcicl: + stb %o4, [%o1 + %o3] + inccc %o3 + bl,a,pt %ncc, .dcicl + lduba [%o0 + %o3]ASI_USER, %o4 + ! + ! We're done. Go home. + ! + membar #Sync + stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] + retl + clr %o0 + ! + ! Try aligned copies from here. + ! +.dci_ns: + ! + ! See if we're single byte aligned. If we are, check the + ! limit for single byte copies. If we're smaller, or equal, + ! bounce to the byte for byte copy loop. Otherwise do it in + ! HW (if enabled). + ! + btst 1, %o3 + bz,a,pt %icc, .dcih8 + btst 7, %o3 + ! + ! We're single byte aligned. + ! + subcc %o2, VIS_COPY_THRESHOLD, %o3 + bleu,pt %ncc, .dcibcp + sethi %hi(hw_copy_limit_1), %o3 + ld [%o3 + %lo(hw_copy_limit_1)], %o3 + ! + ! Is HW copy on? If not do everything byte for byte. + ! + tst %o3 + bz,pn %icc, .dcibcp + subcc %o3, %o2, %o3 + ! + ! Are we bigger than the HW limit? If not + ! go to byte for byte. + ! + bge,pt %ncc, .dcibcp + nop + ! + ! We're big enough and copy is on. Do it with HW. + ! + ba,pt %ncc, .big_copyin + nop +.dcih8: + ! + ! 8 byte aligned? + ! + bnz,a %ncc, .dcih4 + btst 3, %o3 + ! + ! We're eight byte aligned. + ! + subcc %o2, VIS_COPY_THRESHOLD, %o3 + bleu,pt %ncc, .dcis8 + sethi %hi(hw_copy_limit_8), %o3 + ld [%o3 + %lo(hw_copy_limit_8)], %o3 + ! + ! Is HW assist on? If not, do it with the aligned copy. + ! + tst %o3 + bz,pn %icc, .dcis8 + subcc %o3, %o2, %o3 + bge %ncc, .dcis8 + nop + ba,pt %ncc, .big_copyin + nop +.dcis8: + ! + ! Housekeeping for copy loops. Uses same idea as in the byte for + ! byte copy loop above. + ! + add %o0, %o2, %o0 + add %o1, %o2, %o1 + sub %g0, %o2, %o3 + ba,pt %ncc, .didebc + srl %o2, 3, %o2 ! Number of 8 byte chunks to copy + ! + ! 4 byte aligned? + ! +.dcih4: + bnz %ncc, .dcih2 + subcc %o2, VIS_COPY_THRESHOLD, %o3 + bleu,pt %ncc, .dcis4 + sethi %hi(hw_copy_limit_4), %o3 + ld [%o3 + %lo(hw_copy_limit_4)], %o3 + ! + ! Is HW assist on? If not, do it with the aligned copy. + ! + tst %o3 + bz,pn %icc, .dcis4 + subcc %o3, %o2, %o3 + ! + ! We're negative if our size is less than or equal to hw_copy_limit_4. + ! + bge %ncc, .dcis4 + nop + ba,pt %ncc, .big_copyin + nop +.dcis4: + ! + ! Housekeeping for copy loops. Uses same idea as in the byte + ! for byte copy loop above. + ! + add %o0, %o2, %o0 + add %o1, %o2, %o1 + sub %g0, %o2, %o3 + ba,pt %ncc, .didfbc + srl %o2, 2, %o2 ! Number of 4 byte chunks to copy +.dcih2: + ! + ! We're two byte aligned. Check for "smallness" + ! done in delay at .dcih4 + ! + bleu,pt %ncc, .dcis2 + sethi %hi(hw_copy_limit_2), %o3 + ld [%o3 + %lo(hw_copy_limit_2)], %o3 + ! + ! Is HW assist on? If not, do it with the aligned copy. + ! + tst %o3 + bz,pn %icc, .dcis2 + subcc %o3, %o2, %o3 + ! + ! Are we larger than the HW limit? + ! + bge %ncc, .dcis2 + nop + ! + ! HW assist is on and we're large enough to use it. + ! + ba,pt %ncc, .big_copyin + nop + ! + ! Housekeeping for copy loops. Uses same idea as in the byte + ! for byte copy loop above. + ! 
+.dcis2: + add %o0, %o2, %o0 + add %o1, %o2, %o1 + sub %g0, %o2, %o3 + ba,pt %ncc, .didtbc + srl %o2, 1, %o2 ! Number of 2 byte chunks to copy + ! +.small_copyin: + ! + ! Why are we doing this AGAIN? There are certain conditions in + ! big copyin that will cause us to forgo the HW assisted copys + ! and bounce back to a non-hw assisted copy. This dispatches + ! those copies. Note that we branch around this in the main line + ! code. + ! + ! We make no check for limits or HW enablement here. We've + ! already been told that we're a poster child so just go off + ! and do it. + ! + or %o0, %o1, %o3 + btst 1, %o3 + bnz %icc, .dcibcp ! Most likely + btst 7, %o3 + bz %icc, .dcis8 + btst 3, %o3 + bz %icc, .dcis4 + nop + ba,pt %ncc, .dcis2 + nop + ! + ! Eight byte aligned copies. A steal from the original .small_copyin + ! with modifications. %o2 is number of 8 byte chunks to copy. When + ! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more + ! to copy. + ! + .align 32 +.didebc: + ldxa [%o0 + %o3]ASI_USER, %o4 + deccc %o2 + stx %o4, [%o1 + %o3] + bg,pt %ncc, .didebc + addcc %o3, 8, %o3 + ! + ! End of copy loop. Most 8 byte aligned copies end here. + ! + bz,pt %ncc, .dcifh + nop + ! + ! Something is left. Do it byte for byte. + ! + ba,pt %ncc, .dcicl + lduba [%o0 + %o3]ASI_USER, %o4 + ! + ! 4 byte copy loop. %o2 is number of 4 byte chunks to copy. + ! + .align 32 +.didfbc: + lduwa [%o0 + %o3]ASI_USER, %o4 + deccc %o2 + st %o4, [%o1 + %o3] + bg,pt %ncc, .didfbc + addcc %o3, 4, %o3 + ! + ! End of copy loop. Most 4 byte aligned copies end here. + ! + bz,pt %ncc, .dcifh + nop + ! + ! Something is left. Do it byte for byte. + ! + ba,pt %ncc, .dcicl + lduba [%o0 + %o3]ASI_USER, %o4 + ! + ! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to + ! copy. + ! + .align 32 +.didtbc: + lduha [%o0 + %o3]ASI_USER, %o4 + deccc %o2 + sth %o4, [%o1 + %o3] + bg,pt %ncc, .didtbc + addcc %o3, 2, %o3 + ! + ! End of copy loop. Most 2 byte aligned copies end here. + ! + bz,pt %ncc, .dcifh + nop + ! + ! Deal with the last byte + ! + lduba [%o0 + %o3]ASI_USER, %o4 + stb %o4, [%o1 + %o3] +.dcifh: + membar #Sync + stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + clr %o0 + +.big_copyin: + ! + ! Are we using the FP registers? + ! + rd %fprs, %o3 ! check for unused fp + btst FPRS_FEF, %o3 + bnz %ncc, .copyin_fpregs_inuse + nop + ! + ! We're going off to do a block copy. + ! Switch fault hendlers and grab a window. We + ! don't do a membar #Sync since we've done only + ! kernel data to this point. + ! + stn %o4, [THREAD_REG + T_LOFAULT] + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + ! + ! %o3 is %i3 after the save... + ! + st %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] + ba,pt %ncc, .do_blockcopyin + wr %g0, FPRS_FEF, %fprs +.copyin_fpregs_inuse: + ! + ! We're here if the FP regs are in use. Need to see if the request + ! exceeds our suddenly larger minimum. + ! + cmp %i2, VIS_COPY_THRESHOLD+(64*4) + bl %ncc, .small_copyin + nop + ! + ! We're going off and do a block copy. + ! Change to the heavy duty fault handler and grab a window first. + ! New handler is passed in + ! + stn %o4, [THREAD_REG + T_LOFAULT] + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + ! + ! %o3 is now %i3 + ! + st %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] + + ! 
save in-use fpregs on stack + wr %g0, FPRS_FEF, %fprs + membar #Sync + add %fp, STACK_BIAS - 257, %o2 + and %o2, -64, %o2 + stda %d0, [%o2]ASI_BLK_P + add %o2, 64, %o2 + stda %d16, [%o2]ASI_BLK_P + add %o2, 64, %o2 + stda %d32, [%o2]ASI_BLK_P + add %o2, 64, %o2 + stda %d48, [%o2]ASI_BLK_P + membar #Sync + +.do_blockcopyin: + membar #StoreStore|#StoreLoad|#LoadStore + + rd %gsr, %o2 + st %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr + + ! Set the lower bit in the saved t_lofault to indicate + ! that we need to clear the %fprs register on the way + ! out + or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT + + ! Swap src/dst since the code below is memcpy code + ! and memcpy/bcopy have different calling sequences + mov %i1, %i5 + mov %i0, %i1 + mov %i5, %i0 + +!!! This code is nearly identical to the version in the sun4u +!!! libc_psr. Most bugfixes made to that file should be +!!! merged into this routine. + + andcc %i0, 7, %o3 + bz copyin_blkcpy + sub %o3, 8, %o3 + neg %o3 + sub %i2, %o3, %i2 + + ! Align Destination on double-word boundary + +2: lduba [%i1]ASI_USER, %o4 + inc %i1 + inc %i0 + deccc %o3 + bgu %ncc, 2b + stb %o4, [%i0-1] +copyin_blkcpy: + andcc %i0, 63, %i3 + bz,pn %ncc, copyin_blalign ! now block aligned + sub %i3, 64, %i3 + neg %i3 ! bytes till block aligned + sub %i2, %i3, %i2 ! update %i2 with new count + + ! Copy %i3 bytes till dst is block (64 byte) aligned. use + ! double word copies. + + alignaddr %i1, %g0, %g1 + ldda [%g1]ASI_USER, %d0 + add %g1, 8, %g1 +6: + ldda [%g1]ASI_USER, %d2 + add %g1, 8, %g1 + subcc %i3, 8, %i3 + faligndata %d0, %d2, %d8 + std %d8, [%i0] + add %i1, 8, %i1 + bz,pn %ncc, copyin_blalign + add %i0, 8, %i0 + ldda [%g1]ASI_USER, %d0 + add %g1, 8, %g1 + subcc %i3, 8, %i3 + faligndata %d2, %d0, %d8 + std %d8, [%i0] + add %i1, 8, %i1 + bgu,pn %ncc, 6b + add %i0, 8, %i0 + +copyin_blalign: + membar #StoreLoad + ! %i2 = total length + ! %i3 = blocks (length - 64) / 64 + ! %i4 = doubles remaining (length - blocks) + sub %i2, 64, %i3 + andn %i3, 63, %i3 + sub %i2, %i3, %i4 + andn %i4, 7, %i4 + sub %i4, 16, %i4 + sub %i2, %i4, %i2 + sub %i2, %i3, %i2 + + andn %i1, 0x3f, %l7 ! blk aligned address + alignaddr %i1, %g0, %g0 ! gen %gsr + + srl %i1, 3, %l5 ! bits 3,4,5 are now least sig in %l5 + andcc %l5, 7, %i5 ! mask everything except bits 1,2 3 + add %i1, %i4, %i1 + add %i1, %i3, %i1 + + ldda [%l7]ASI_BLK_AIUS, %d0 + add %l7, 64, %l7 + ldda [%l7]ASI_BLK_AIUS, %d16 + add %l7, 64, %l7 + ldda [%l7]ASI_BLK_AIUS, %d32 + add %l7, 64, %l7 + sub %i3, 128, %i3 + + ! switch statement to get us to the right 8 byte blk within a + ! 64 byte block + + cmp %i5, 4 + bgeu,a copyin_hlf + cmp %i5, 6 + cmp %i5, 2 + bgeu,a copyin_sqtr + nop + cmp %i5, 1 + be,a copyin_seg1 + nop + ba,pt %ncc, copyin_seg0 + nop +copyin_sqtr: + be,a copyin_seg2 + nop + ba,pt %ncc, copyin_seg3 + nop + +copyin_hlf: + bgeu,a copyin_fqtr + nop + cmp %i5, 5 + be,a copyin_seg5 + nop + ba,pt %ncc, copyin_seg4 + nop +copyin_fqtr: + be,a copyin_seg6 + nop + ba,pt %ncc, copyin_seg7 + nop + +copyin_seg0: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D0 + ldda [%l7]ASI_BLK_AIUS, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D16 + ldda [%l7]ASI_BLK_AIUS, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 
3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D32 + ldda [%l7]ASI_BLK_AIUS, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyin_seg0 + +0: + FALIGN_D16 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D32 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd0 + add %i0, 64, %i0 + +1: + FALIGN_D32 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D0 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd16 + add %i0, 64, %i0 + +2: + FALIGN_D0 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D16 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd32 + add %i0, 64, %i0 + +copyin_seg1: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D2 + ldda [%l7]ASI_BLK_AIUS, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D18 + ldda [%l7]ASI_BLK_AIUS, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D34 + ldda [%l7]ASI_BLK_AIUS, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyin_seg1 +0: + FALIGN_D18 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D34 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd2 + add %i0, 64, %i0 + +1: + FALIGN_D34 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D2 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd18 + add %i0, 64, %i0 + +2: + FALIGN_D2 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D18 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd34 + add %i0, 64, %i0 +copyin_seg2: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D4 + ldda [%l7]ASI_BLK_AIUS, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D20 + ldda [%l7]ASI_BLK_AIUS, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D36 + ldda [%l7]ASI_BLK_AIUS, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyin_seg2 + +0: + FALIGN_D20 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D36 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd4 + add %i0, 64, %i0 + +1: + FALIGN_D36 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D4 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd20 + add %i0, 64, %i0 + +2: + FALIGN_D4 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D20 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd36 + add %i0, 64, %i0 + +copyin_seg3: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D6 + ldda [%l7]ASI_BLK_AIUS, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D22 + ldda [%l7]ASI_BLK_AIUS, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 
3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D38 + ldda [%l7]ASI_BLK_AIUS, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyin_seg3 + +0: + FALIGN_D22 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D38 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd6 + add %i0, 64, %i0 + +1: + FALIGN_D38 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D6 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd22 + add %i0, 64, %i0 + +2: + FALIGN_D6 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D22 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd38 + add %i0, 64, %i0 + +copyin_seg4: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D8 + ldda [%l7]ASI_BLK_AIUS, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D24 + ldda [%l7]ASI_BLK_AIUS, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D40 + ldda [%l7]ASI_BLK_AIUS, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyin_seg4 + +0: + FALIGN_D24 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D40 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd8 + add %i0, 64, %i0 + +1: + FALIGN_D40 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D8 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd24 + add %i0, 64, %i0 + +2: + FALIGN_D8 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D24 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd40 + add %i0, 64, %i0 + +copyin_seg5: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D10 + ldda [%l7]ASI_BLK_AIUS, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D26 + ldda [%l7]ASI_BLK_AIUS, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D42 + ldda [%l7]ASI_BLK_AIUS, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyin_seg5 + +0: + FALIGN_D26 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D42 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd10 + add %i0, 64, %i0 + +1: + FALIGN_D42 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D10 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd26 + add %i0, 64, %i0 + +2: + FALIGN_D10 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D26 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd42 + add %i0, 64, %i0 + +copyin_seg6: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D12 + ldda [%l7]ASI_BLK_AIUS, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D28 + ldda [%l7]ASI_BLK_AIUS, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 
3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D44 + ldda [%l7]ASI_BLK_AIUS, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyin_seg6 + +0: + FALIGN_D28 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D44 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd12 + add %i0, 64, %i0 + +1: + FALIGN_D44 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D12 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd28 + add %i0, 64, %i0 + +2: + FALIGN_D12 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D28 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd44 + add %i0, 64, %i0 + +copyin_seg7: + ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst + FALIGN_D14 + ldda [%l7]ASI_BLK_AIUS, %d0 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 0f + add %i0, 64, %i0 + ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst + FALIGN_D30 + ldda [%l7]ASI_BLK_AIUS, %d16 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 1f + add %i0, 64, %i0 + ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst + FALIGN_D46 + ldda [%l7]ASI_BLK_AIUS, %d32 + stda %d48, [%i0]ASI_BLK_P + add %l7, 64, %l7 + subcc %i3, 64, %i3 + bz,pn %ncc, 2f + add %i0, 64, %i0 + ba,a,pt %ncc, copyin_seg7 + +0: + FALIGN_D30 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D46 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd14 + add %i0, 64, %i0 + +1: + FALIGN_D46 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D14 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd30 + add %i0, 64, %i0 + +2: + FALIGN_D14 + stda %d48, [%i0]ASI_BLK_P + add %i0, 64, %i0 + membar #Sync + FALIGN_D30 + stda %d48, [%i0]ASI_BLK_P + ba,pt %ncc, copyin_blkd46 + add %i0, 64, %i0 + + + ! + ! dribble out the last partial block + ! 
+copyin_blkd0: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d0, %d2, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd2: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d2, %d4, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd4: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d4, %d6, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd6: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d6, %d8, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd8: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d8, %d10, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd10: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d10, %d12, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd12: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d12, %d14, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd14: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + fsrc1 %d14, %d0 + ba,a,pt %ncc, copyin_blkleft + +copyin_blkd16: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d16, %d18, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd18: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d18, %d20, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd20: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d20, %d22, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd22: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d22, %d24, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd24: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d24, %d26, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd26: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d26, %d28, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd28: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d28, %d30, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd30: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + fsrc1 %d30, %d0 + ba,a,pt %ncc, copyin_blkleft +copyin_blkd32: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d32, %d34, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd34: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d34, %d36, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd36: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d36, %d38, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd38: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d38, %d40, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd40: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d40, %d42, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd42: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d42, %d44, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd44: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + faligndata %d44, %d46, %d48 + std %d48, [%i0] + add %i0, 8, %i0 +copyin_blkd46: + subcc %i4, 8, %i4 + blu,pn %ncc, copyin_blkdone + fsrc1 %d46, %d0 + +copyin_blkleft: +1: + ldda [%l7]ASI_USER, %d2 + add %l7, 8, %l7 + subcc %i4, 8, %i4 + faligndata %d0, %d2, %d8 + std %d8, [%i0] + blu,pn %ncc, copyin_blkdone + add %i0, 8, %i0 + ldda [%l7]ASI_USER, %d0 + add %l7, 8, %l7 + subcc %i4, 8, %i4 + faligndata %d2, %d0, %d8 + std %d8, [%i0] + bgeu,pt %ncc, 1b + add %i0, 8, %i0 + +copyin_blkdone: + tst %i2 + bz,pt %ncc, .copyin_exit + and %l3, 0x4, %l3 ! 
fprs.du = fprs.dl = 0 + +7: lduba [%i1]ASI_USER, %i4 + inc %i1 + inc %i0 + deccc %i2 + bgu %ncc, 7b + stb %i4, [%i0 - 1] + +.copyin_exit: + membar #StoreLoad|#StoreStore + btst FPUSED_FLAG, SAVED_LOFAULT + bz %icc, 1f + nop + + ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr + wr %o2, 0, %gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz %icc, 4f + nop + + ! restore fpregs from stack + membar #Sync + add %fp, STACK_BIAS - 257, %o2 + and %o2, -64, %o2 + ldda [%o2]ASI_BLK_P, %d0 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d16 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d32 + add %o2, 64, %o2 + ldda [%o2]ASI_BLK_P, %d48 + membar #Sync + + ba,pt %ncc, 1f + wr %o3, 0, %fprs ! restore fprs + +4: + FZERO ! zero all of the fpregs + wr %o3, 0, %fprs ! restore fprs + +1: + andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT + membar #Sync ! sync error barrier + stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + ret + restore %g0, 0, %o0 +.copyin_err: + ldn [THREAD_REG + T_COPYOPS], %o4 + brz %o4, 2f + nop + ldn [%o4 + CP_COPYIN], %g2 + jmp %g2 + nop +2: + retl + mov -1, %o0 + SET_SIZE(copyin) + +#endif /* lint */ + +#ifdef lint + +/*ARGSUSED*/ +int +xcopyin(const void *uaddr, void *kaddr, size_t count) +{ return (0); } + +#else /* lint */ + + ENTRY(xcopyin) + sethi %hi(.xcopyin_err), REAL_LOFAULT + b .do_copyin + or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT +.xcopyin_err: + ldn [THREAD_REG + T_COPYOPS], %o4 + brz %o4, 2f + nop + ldn [%o4 + CP_XCOPYIN], %g2 + jmp %g2 + nop +2: + retl + mov %g1, %o0 + SET_SIZE(xcopyin) + +#endif /* lint */ + +#ifdef lint + +/*ARGSUSED*/ +int +xcopyin_little(const void *uaddr, void *kaddr, size_t count) +{ return (0); } + +#else /* lint */ + + ENTRY(xcopyin_little) + sethi %hi(.little_err), %o4 + ldn [THREAD_REG + T_LOFAULT], %o5 + or %o4, %lo(.little_err), %o4 + membar #Sync ! sync error barrier + stn %o4, [THREAD_REG + T_LOFAULT] + + subcc %g0, %o2, %o3 + add %o0, %o2, %o0 + bz,pn %ncc, 2f ! check for zero bytes + sub %o2, 1, %o4 + add %o0, %o4, %o0 ! start w/last byte + add %o1, %o2, %o1 + lduba [%o0+%o3]ASI_AIUSL, %o4 + +1: stb %o4, [%o1+%o3] + inccc %o3 + sub %o0, 2, %o0 ! get next byte + bcc,a,pt %ncc, 1b + lduba [%o0+%o3]ASI_AIUSL, %o4 + +2: membar #Sync ! sync error barrier + stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g0, %o0 ! return (0) + +.little_err: + membar #Sync ! sync error barrier + stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + retl + mov %g1, %o0 + SET_SIZE(xcopyin_little) + +#endif /* lint */ + + +/* + * Copy a block of storage - must not overlap (from + len <= to). + * No fault handler installed (to be called under on_fault()) + */ +#if defined(lint) + +/* ARGSUSED */ +void +copyin_noerr(const void *ufrom, void *kto, size_t count) +{} + +#else /* lint */ + + ENTRY(copyin_noerr) + sethi %hi(.copyio_noerr), REAL_LOFAULT + b .do_copyin + or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT +.copyio_noerr: + jmp SAVED_LOFAULT + nop + SET_SIZE(copyin_noerr) + +#endif /* lint */ + +/* + * Copy a block of storage - must not overlap (from + len <= to). 
+ * No fault handler installed (to be called under on_fault()) + */ + +#if defined(lint) + +/* ARGSUSED */ +void +copyout_noerr(const void *kfrom, void *uto, size_t count) +{} + +#else /* lint */ + + ENTRY(copyout_noerr) + sethi %hi(.copyio_noerr), REAL_LOFAULT + b .do_copyout + or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT + SET_SIZE(copyout_noerr) + +#endif /* lint */ + +#if defined(lint) + +int use_hw_bcopy = 1; +int use_hw_copyio = 1; +int use_hw_bzero = 1; +uint_t hw_copy_limit_1 = 0; +uint_t hw_copy_limit_2 = 0; +uint_t hw_copy_limit_4 = 0; +uint_t hw_copy_limit_8 = 0; + +#else /* !lint */ + + .align 4 + DGDEF(use_hw_bcopy) + .word 1 + DGDEF(use_hw_copyio) + .word 1 + DGDEF(use_hw_bzero) + .word 1 + DGDEF(hw_copy_limit_1) + .word 0 + DGDEF(hw_copy_limit_2) + .word 0 + DGDEF(hw_copy_limit_4) + .word 0 + DGDEF(hw_copy_limit_8) + .word 0 + + .align 64 + .section ".text" +#endif /* !lint */ + + +/* + * hwblkclr - clears block-aligned, block-multiple-sized regions that are + * longer than 256 bytes in length using spitfire's block stores. If + * the criteria for using this routine are not met then it calls bzero + * and returns 1. Otherwise 0 is returned indicating success. + * Caller is responsible for ensuring use_hw_bzero is true and that + * kpreempt_disable() has been called. + */ +#ifdef lint +/*ARGSUSED*/ +int +hwblkclr(void *addr, size_t len) +{ + return(0); +} +#else /* lint */ + ! %i0 - start address + ! %i1 - length of region (multiple of 64) + ! %l0 - saved fprs + ! %l1 - pointer to saved %d0 block + ! %l2 - saved curthread->t_lwp + + ENTRY(hwblkclr) + ! get another window w/space for one aligned block of saved fpregs + save %sp, -SA(MINFRAME + 2*64), %sp + + ! Must be block-aligned + andcc %i0, (64-1), %g0 + bnz,pn %ncc, 1f + nop + + ! ... and must be 256 bytes or more + cmp %i1, 256 + blu,pn %ncc, 1f + nop + + ! ... and length must be a multiple of 64 + andcc %i1, (64-1), %g0 + bz,pn %ncc, 2f + nop + +1: ! punt, call bzero but notify the caller that bzero was used + mov %i0, %o0 + call bzero + mov %i1, %o1 + ret + restore %g0, 1, %o0 ! return (1) - did not use block operations + +2: rd %fprs, %l0 ! check for unused fp + btst FPRS_FEF, %l0 + bz 1f + nop + + ! save in-use fpregs on stack + membar #Sync + add %fp, STACK_BIAS - 65, %l1 + and %l1, -64, %l1 + stda %d0, [%l1]ASI_BLK_P + +1: membar #StoreStore|#StoreLoad|#LoadStore + wr %g0, FPRS_FEF, %fprs + wr %g0, ASI_BLK_P, %asi + + ! Clear block + fzero %d0 + fzero %d2 + fzero %d4 + fzero %d6 + fzero %d8 + fzero %d10 + fzero %d12 + fzero %d14 + + mov 256, %i3 + ba .pz_doblock + nop + +.pz_blkstart: + ! stda %d0, [%i0+192]%asi ! in dly slot of branch that got us here + stda %d0, [%i0+128]%asi + stda %d0, [%i0+64]%asi + stda %d0, [%i0]%asi +.pz_zinst: + add %i0, %i3, %i0 + sub %i1, %i3, %i1 +.pz_doblock: + cmp %i1, 256 + bgeu,a %ncc, .pz_blkstart + stda %d0, [%i0+192]%asi + + cmp %i1, 64 + blu %ncc, .pz_finish + + andn %i1, (64-1), %i3 + srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words + set .pz_zinst, %i4 + sub %i4, %i2, %i4 + jmp %i4 + nop + +.pz_finish: + membar #Sync + btst FPRS_FEF, %l0 + bz,a .pz_finished + wr %l0, 0, %fprs ! restore fprs + + ! restore fpregs from stack + ldda [%l1]ASI_BLK_P, %d0 + membar #Sync + wr %l0, 0, %fprs ! restore fprs + +.pz_finished: + ret + restore %g0, 0, %o0 ! 
return (bzero or not) + SET_SIZE(hwblkclr) +#endif /* lint */ + +#ifdef lint +/* Copy 32 bytes of data from src to dst using physical addresses */ +/*ARGSUSED*/ +void +hw_pa_bcopy32(uint64_t src, uint64_t dst) +{} +#else /*!lint */ + + /* + * Copy 32 bytes of data from src (%o0) to dst (%o1) + * using physical addresses. + */ + ENTRY_NP(hw_pa_bcopy32) + rdpr %pstate, %g1 + andn %g1, PSTATE_IE, %g2 + wrpr %g0, %g2, %pstate + + ldxa [%o0]ASI_MEM, %o2 + add %o0, 8, %o0 + ldxa [%o0]ASI_MEM, %o3 + add %o0, 8, %o0 + ldxa [%o0]ASI_MEM, %o4 + add %o0, 8, %o0 + ldxa [%o0]ASI_MEM, %o5 + stxa %o2, [%o1]ASI_MEM + add %o1, 8, %o1 + stxa %o3, [%o1]ASI_MEM + add %o1, 8, %o1 + stxa %o4, [%o1]ASI_MEM + add %o1, 8, %o1 + stxa %o5, [%o1]ASI_MEM + + membar #Sync + retl + wrpr %g0, %g1, %pstate + SET_SIZE(hw_pa_bcopy32) +#endif /* lint */ diff --git a/usr/src/uts/sun4u/cpu/spitfire_kdi.c b/usr/src/uts/sun4u/cpu/spitfire_kdi.c new file mode 100644 index 0000000000..c9097fc8b6 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/spitfire_kdi.c @@ -0,0 +1,152 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * CPU-specific functions needed by the Kernel-Debugger Interface (KDI). These + * functions are invoked directly by the kernel debugger (kmdb) while the system + * has been stopped, and as such must not use any kernel facilities that block + * or otherwise rely on forward progress by other parts of the kernel. + * + * These functions may also be called before unix`_start, and as such cannot + * use any kernel facilities that must be initialized as part of system start. + * An example of such a facility is drv_usecwait(), which relies on a parameter + * that is initialized by the unix module. As a result, drv_usecwait() may not + * be used by KDI functions. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/archsystm.h> +#include <sys/machsystm.h> +#include <sys/cpu_module.h> +#include <sys/spitregs.h> +#include <sys/xc_impl.h> +#include <sys/intreg.h> +#include <sys/kdi_impl.h> + +/* + * We keep our own copies, used for cache flushing, because we can be called + * before cpu_fiximpl(). 
+ */ +static int kdi_dcache_size; +static int kdi_dcache_linesize; +static int kdi_icache_size; +static int kdi_icache_linesize; + +/* + * Assembly support for spitfire modules in spitfire_asm.s + */ +extern int idsr_busy(void); +extern void init_mondo_nocheck(xcfunc_t *func, uint64_t arg1, uint64_t arg2); +extern void shipit(int); +extern void kdi_flush_idcache(int, int, int, int); + +static int +kdi_cpu_ready_iter(int (*cb)(int, void *), void *arg) +{ + int rc, i; + + for (rc = 0, i = 0; i < NCPU; i++) { + if (CPU_IN_SET(cpu_ready_set, i)) + rc += cb(i, arg); + } + + return (rc); +} + +/* + * Sends a cross-call to a specified processor. The caller assumes + * responsibility for repetition of cross-calls, as appropriate (MARSA for + * debugging). + */ +static int +kdi_xc_one(int cpuid, void (*func)(uintptr_t, uintptr_t), uintptr_t arg1, + uintptr_t arg2) +{ + uint64_t idsr; + + /* + * if (idsr_busy()) + * return (KDI_XC_RES_ERR); + */ + + init_mondo_nocheck((xcfunc_t *)func, arg1, arg2); + + shipit(CPUID_TO_UPAID(cpuid)); + + if ((idsr = getidsr()) == 0) + return (KDI_XC_RES_OK); + else if (idsr & IDSR_BUSY) + return (KDI_XC_RES_BUSY); + else + return (KDI_XC_RES_NACK); +} + +static void +kdi_tickwait(clock_t nticks) +{ + clock_t endtick = gettick() + nticks; + + while (gettick() < endtick); +} + +static void +kdi_cpu_init(int dcache_size, int dcache_linesize, int icache_size, + int icache_linesize) +{ + kdi_dcache_size = dcache_size; + kdi_dcache_linesize = dcache_linesize; + kdi_icache_size = icache_size; + kdi_icache_linesize = icache_linesize; +} + +/* used directly by kdi_read/write_phys */ +void +kdi_flush_caches(void) +{ + kdi_flush_idcache(kdi_dcache_size, kdi_dcache_linesize, + kdi_icache_size, kdi_icache_linesize); +} + +/*ARGSUSED*/ +int +kdi_get_stick(uint64_t *stickp) +{ + return (-1); +} + +void +cpu_kdi_init(kdi_t *kdi) +{ + kdi->kdi_flush_caches = kdi_flush_caches; + kdi->mkdi_cpu_init = kdi_cpu_init; + kdi->mkdi_cpu_ready_iter = kdi_cpu_ready_iter; + kdi->mkdi_xc_one = kdi_xc_one; + kdi->mkdi_tickwait = kdi_tickwait; + kdi->mkdi_get_stick = kdi_get_stick; +} diff --git a/usr/src/uts/sun4u/cpu/us3_cheetah.c b/usr/src/uts/sun4u/cpu/us3_cheetah.c new file mode 100644 index 0000000000..6ff125f311 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/us3_cheetah.c @@ -0,0 +1,731 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/ddi.h> +#include <sys/sysmacros.h> +#include <sys/archsystm.h> +#include <sys/vmsystm.h> +#include <sys/machparam.h> +#include <sys/machsystm.h> +#include <sys/machthread.h> +#include <sys/cpu.h> +#include <sys/cmp.h> +#include <sys/elf_SPARC.h> +#include <vm/hat_sfmmu.h> +#include <vm/seg_kmem.h> +#include <sys/cpuvar.h> +#include <sys/cheetahregs.h> +#include <sys/us3_module.h> +#include <sys/async.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/dditypes.h> +#include <sys/prom_debug.h> +#include <sys/prom_plat.h> +#include <sys/cpu_module.h> +#include <sys/sysmacros.h> +#include <sys/intreg.h> +#include <sys/clock.h> +#include <sys/platform_module.h> +#include <sys/machtrap.h> +#include <sys/ontrap.h> +#include <sys/panic.h> +#include <sys/memlist.h> +#include <sys/bootconf.h> +#include <sys/ivintr.h> +#include <sys/atomic.h> +#include <sys/fm/protocol.h> +#include <sys/fm/cpu/UltraSPARC-III.h> +#include <vm/vm_dep.h> + +#ifdef CHEETAHPLUS_ERRATUM_25 +#include <sys/cyclic.h> +#endif /* CHEETAHPLUS_ERRATUM_25 */ + +/* + * Setup trap handlers. + */ +void +cpu_init_trap(void) +{ + CH_SET_TRAP(tt_pil15, ch_pil15_interrupt_instr); + + CH_SET_TRAP(tt0_fecc, fecc_err_instr); + CH_SET_TRAP(tt1_fecc, fecc_err_tl1_instr); + CH_SET_TRAP(tt1_swtrap0, fecc_err_tl1_cont_instr); +} + +static int +getintprop(dnode_t node, char *name, int deflt) +{ + int value; + + switch (prom_getproplen(node, name)) { + case sizeof (int): + (void) prom_getprop(node, name, (caddr_t)&value); + break; + + default: + value = deflt; + break; + } + + return (value); +} + +/* + * Set the magic constants of the implementation. + */ +/*ARGSUSED*/ +void +cpu_fiximp(dnode_t dnode) +{ + int i, a; + + static struct { + char *name; + int *var; + int defval; + } prop[] = { + "dcache-size", &dcache_size, CH_DCACHE_SIZE, + "dcache-line-size", &dcache_linesize, CH_DCACHE_LSIZE, + "icache-size", &icache_size, CH_ICACHE_SIZE, + "icache-line-size", &icache_linesize, CH_ICACHE_LSIZE, + "ecache-size", &ecache_size, CH_ECACHE_MAX_SIZE, + "ecache-line-size", &ecache_alignsize, CH_ECACHE_MAX_LSIZE, + "ecache-associativity", &ecache_associativity, CH_ECACHE_NWAY + }; + + extern int exec_lpg_disable, use_brk_lpg, use_stk_lpg, use_zmap_lpg; + + + for (i = 0; i < sizeof (prop) / sizeof (prop[0]); i++) + *prop[i].var = getintprop(dnode, prop[i].name, prop[i].defval); + + ecache_setsize = ecache_size / ecache_associativity; + + vac_size = CH_VAC_SIZE; + vac_mask = MMU_PAGEMASK & (vac_size - 1); + i = 0; a = vac_size; + while (a >>= 1) + ++i; + vac_shift = i; + shm_alignment = vac_size; + vac = 1; + + /* + * Cheetah's large page support has problems with large numbers of + * large pages, so just disable large pages out-of-the-box. 
+ */ + exec_lpg_disable = 1; + use_brk_lpg = 0; + use_stk_lpg = 0; + use_zmap_lpg = 0; +} + +void +send_mondo_set(cpuset_t set) +{ + int lo, busy, nack, shipped = 0; + uint16_t i, cpuids[IDSR_BN_SETS]; + uint64_t idsr, nackmask = 0, busymask, curnack, curbusy; + uint64_t starttick, endtick, tick, lasttick; +#if (NCPU > IDSR_BN_SETS) + int index = 0; + int ncpuids = 0; +#endif +#ifdef CHEETAHPLUS_ERRATUM_25 + int recovered = 0; + int cpuid; +#endif + + ASSERT(!CPUSET_ISNULL(set)); + starttick = lasttick = gettick(); + +#if (NCPU <= IDSR_BN_SETS) + for (i = 0; i < NCPU; i++) + if (CPU_IN_SET(set, i)) { + shipit(i, shipped); + nackmask |= IDSR_NACK_BIT(shipped); + cpuids[shipped++] = i; + CPUSET_DEL(set, i); + if (CPUSET_ISNULL(set)) + break; + } + CPU_STATS_ADDQ(CPU, sys, xcalls, shipped); +#else + for (i = 0; i < NCPU; i++) + if (CPU_IN_SET(set, i)) { + ncpuids++; + + /* + * Ship only to the first (IDSR_BN_SETS) CPUs. If we + * find we have shipped to more than (IDSR_BN_SETS) + * CPUs, set "index" to the highest numbered CPU in + * the set so we can ship to other CPUs a bit later on. + */ + if (shipped < IDSR_BN_SETS) { + shipit(i, shipped); + nackmask |= IDSR_NACK_BIT(shipped); + cpuids[shipped++] = i; + CPUSET_DEL(set, i); + if (CPUSET_ISNULL(set)) + break; + } else + index = (int)i; + } + + CPU_STATS_ADDQ(CPU, sys, xcalls, ncpuids); +#endif + + busymask = IDSR_NACK_TO_BUSY(nackmask); + busy = nack = 0; + endtick = starttick + xc_tick_limit; + for (;;) { + idsr = getidsr(); +#if (NCPU <= IDSR_BN_SETS) + if (idsr == 0) + break; +#else + if (idsr == 0 && shipped == ncpuids) + break; +#endif + tick = gettick(); + /* + * If there is a big jump between the current tick + * count and lasttick, we have probably hit a break + * point. Adjust endtick accordingly to avoid panic. + */ + if (tick > (lasttick + xc_tick_jump_limit)) + endtick += (tick - lasttick); + lasttick = tick; + if (tick > endtick) { + if (panic_quiesce) + return; +#ifdef CHEETAHPLUS_ERRATUM_25 + cpuid = -1; + for (i = 0; i < IDSR_BN_SETS; i++) { + if (idsr & (IDSR_NACK_BIT(i) | + IDSR_BUSY_BIT(i))) { + cpuid = cpuids[i]; + break; + } + } + if (cheetah_sendmondo_recover && cpuid != -1 && + recovered == 0) { + if (mondo_recover(cpuid, i)) { + /* + * We claimed the whole memory or + * full scan is disabled. + */ + recovered++; + } + tick = gettick(); + endtick = tick + xc_tick_limit; + lasttick = tick; + /* + * Recheck idsr + */ + continue; + } else +#endif /* CHEETAHPLUS_ERRATUM_25 */ + { + cmn_err(CE_CONT, "send mondo timeout " + "[%d NACK %d BUSY]\nIDSR 0x%" + "" PRIx64 " cpuids:", nack, busy, idsr); + for (i = 0; i < IDSR_BN_SETS; i++) { + if (idsr & (IDSR_NACK_BIT(i) | + IDSR_BUSY_BIT(i))) { + cmn_err(CE_CONT, " 0x%x", + cpuids[i]); + } + } + cmn_err(CE_CONT, "\n"); + cmn_err(CE_PANIC, "send_mondo_set: timeout"); + } + } + curnack = idsr & nackmask; + curbusy = idsr & busymask; +#if (NCPU > IDSR_BN_SETS) + if (shipped < ncpuids) { + uint64_t cpus_left; + uint16_t next = (uint16_t)index; + + cpus_left = ~(IDSR_NACK_TO_BUSY(curnack) | curbusy) & + busymask; + + if (cpus_left) { + do { + /* + * Sequence through and ship to the + * remainder of the CPUs in the system + * (e.g. other than the first + * (IDSR_BN_SETS)) in reverse order. + */ + lo = lowbit(cpus_left) - 1; + i = IDSR_BUSY_IDX(lo); + shipit(next, i); + shipped++; + cpuids[i] = next; + + /* + * If we've processed all the CPUs, + * exit the loop now and save + * instructions. 
+ */ + if (shipped == ncpuids) + break; + + for ((index = ((int)next - 1)); + index >= 0; index--) + if (CPU_IN_SET(set, index)) { + next = (uint16_t)index; + break; + } + + cpus_left &= ~(1ull << lo); + } while (cpus_left); +#ifdef CHEETAHPLUS_ERRATUM_25 + /* + * Clear recovered because we are sending to + * a new set of targets. + */ + recovered = 0; +#endif + continue; + } + } +#endif + if (curbusy) { + busy++; + continue; + } + +#ifdef SEND_MONDO_STATS + { + int n = gettick() - starttick; + if (n < 8192) + x_nack_stimes[n >> 7]++; + } +#endif + while (gettick() < (tick + sys_clock_mhz)) + ; + do { + lo = lowbit(curnack) - 1; + i = IDSR_NACK_IDX(lo); + shipit(cpuids[i], i); + curnack &= ~(1ull << lo); + } while (curnack); + nack++; + busy = 0; + } +#ifdef SEND_MONDO_STATS + { + int n = gettick() - starttick; + if (n < 8192) + x_set_stimes[n >> 7]++; + else + x_set_ltimes[(n >> 13) & 0xf]++; + } + x_set_cpus[shipped]++; +#endif +} + +/* + * Handles error logging for implementation specific error types. + */ +/*ARGSUSED*/ +int +cpu_impl_async_log_err(void *flt, errorq_elem_t *eqep) +{ + /* There aren't any error types which are specific to cheetah only */ + return (CH_ASYNC_LOG_UNKNOWN); +} + +/* + * Figure out if Ecache is direct-mapped (Cheetah or Cheetah+ with Ecache + * control ECCR_ASSOC bit off or 2-way (Cheetah+ with ECCR_ASSOC on). + * We need to do this on the fly because we may have mixed Cheetah+'s with + * both direct and 2-way Ecaches. + */ +int +cpu_ecache_nway(void) +{ + return (CH_ECACHE_NWAY); +} + +/* + * Note that these are entered into the table: Fatal Errors (PERR, IERR, + * ISAP, EMU) first, orphaned UCU/UCC, AFAR Overwrite policy, finally IVU, IVC. + * Afar overwrite policy is: + * UCU,UCC > UE,EDU,WDU,CPU > CE,EDC,EMC,WDC,CPC > TO,BERR + */ +ecc_type_to_info_t ecc_type_to_info[] = { + + /* Fatal Errors */ + C_AFSR_PERR, "PERR ", ECC_ALL_TRAPS, CPU_FATAL, + "PERR Fatal", + FM_EREPORT_PAYLOAD_SYSTEM2, + FM_EREPORT_CPU_USIII_PERR, + C_AFSR_IERR, "IERR ", ECC_ALL_TRAPS, CPU_FATAL, + "IERR Fatal", + FM_EREPORT_PAYLOAD_SYSTEM2, + FM_EREPORT_CPU_USIII_IERR, + C_AFSR_ISAP, "ISAP ", ECC_ALL_TRAPS, CPU_FATAL, + "ISAP Fatal", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_ISAP, + C_AFSR_EMU, "EMU ", ECC_ASYNC_TRAPS, CPU_FATAL, + "EMU Fatal", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_EMU, + + /* Orphaned UCC/UCU Errors */ + C_AFSR_UCU, "OUCU ", ECC_ORPH_TRAPS, CPU_ORPH, + "Orphaned UCU", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_UCU, + C_AFSR_UCC, "OUCC ", ECC_ORPH_TRAPS, CPU_ORPH, + "Orphaned UCC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_UCC, + + /* UCU, UCC */ + C_AFSR_UCU, "UCU ", ECC_F_TRAP, CPU_UE_ECACHE, + "UCU", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_UCU, + C_AFSR_UCC, "UCC ", ECC_F_TRAP, CPU_CE_ECACHE, + "UCC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_UCC, + + /* UE, EDU:ST, EDU:BLD, WDU, CPU */ + C_AFSR_UE, "UE ", ECC_ASYNC_TRAPS, CPU_UE, + "Uncorrectable system bus (UE)", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_UE, + C_AFSR_EDU, "EDU ", ECC_C_TRAP, CPU_UE_ECACHE_RETIRE, + "EDU:ST", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_EDUST, + C_AFSR_EDU, "EDU ", ECC_D_TRAP, CPU_UE_ECACHE_RETIRE, + "EDU:BLD", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_EDUBL, + C_AFSR_WDU, "WDU ", ECC_C_TRAP, CPU_UE_ECACHE_RETIRE, + "WDU", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_WDU, + C_AFSR_CPU, "CPU ", ECC_C_TRAP, CPU_UE_ECACHE, + "CPU", + FM_EREPORT_PAYLOAD_L2_DATA, + 
FM_EREPORT_CPU_USIII_CPU, + + /* CE, EDC, EMC, WDC, CPC */ + C_AFSR_CE, "CE ", ECC_C_TRAP, CPU_CE, + "Corrected system bus (CE)", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_CE, + C_AFSR_EDC, "EDC ", ECC_C_TRAP, CPU_CE_ECACHE, + "EDC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_EDC, + C_AFSR_EMC, "EMC ", ECC_C_TRAP, CPU_EMC, + "EMC", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_EMC, + C_AFSR_WDC, "WDC ", ECC_C_TRAP, CPU_CE_ECACHE, + "WDC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_WDC, + C_AFSR_CPC, "CPC ", ECC_C_TRAP, CPU_CE_ECACHE, + "CPC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_CPC, + + /* TO, BERR */ + C_AFSR_TO, "TO ", ECC_ASYNC_TRAPS, CPU_TO, + "Timeout (TO)", + FM_EREPORT_PAYLOAD_IO, + FM_EREPORT_CPU_USIII_TO, + C_AFSR_BERR, "BERR ", ECC_ASYNC_TRAPS, CPU_BERR, + "Bus Error (BERR)", + FM_EREPORT_PAYLOAD_IO, + FM_EREPORT_CPU_USIII_BERR, + + /* IVU, IVC */ + C_AFSR_IVU, "IVU ", ECC_C_TRAP, CPU_IV, + "IVU", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_IVU, + C_AFSR_IVC, "IVC ", ECC_C_TRAP, CPU_IV, + "IVC", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_IVC, + + 0, NULL, 0, 0, + NULL, + FM_EREPORT_PAYLOAD_UNKNOWN, + FM_EREPORT_CPU_USIII_UNKNOWN, +}; + +/* + * Prioritized list of Error bits for AFAR overwrite. + * See Cheetah PRM P.6.1 + * Class 4: UCC, UCU + * Class 3: UE, EDU, EMU, WDU, CPU + * Class 2: CE, EDC, EMC, WDC, CPC + * Class 1: TO, BERR + */ +uint64_t afar_overwrite[] = { + C_AFSR_UCC | C_AFSR_UCU, + C_AFSR_UE | C_AFSR_EDU | C_AFSR_EMU | C_AFSR_WDU | C_AFSR_CPU, + C_AFSR_CE | C_AFSR_EDC | C_AFSR_EMC | C_AFSR_WDC | C_AFSR_CPC, + C_AFSR_TO | C_AFSR_BERR, + 0 +}; + +/* + * Prioritized list of Error bits for ESYND overwrite. + * See Cheetah PRM P.6.2 + * Class 2: UE, IVU, EDU, WDU, UCU, CPU + * Class 1: CE, IVC, EDC, WDC, UCC, CPC + */ +uint64_t esynd_overwrite[] = { + C_AFSR_UE | C_AFSR_IVU | C_AFSR_EDU | C_AFSR_WDU | C_AFSR_UCU | + C_AFSR_CPU, + C_AFSR_CE | C_AFSR_IVC | C_AFSR_EDC | C_AFSR_WDC | C_AFSR_UCC | + C_AFSR_CPC, + 0 +}; + +/* + * Prioritized list of Error bits for MSYND overwrite. + * See Cheetah PRM P.6.3 + * Class 2: EMU + * Class 1: EMC + */ +uint64_t msynd_overwrite[] = { + C_AFSR_EMU, + C_AFSR_EMC, + 0 +}; + +/* + * change cpu speed bits -- new speed will be normal-speed/divisor. + * + * The Jalapeno memory controllers are required to drain outstanding + * memory transactions within 32 JBus clocks in order to be ready + * to enter Estar mode. In some corner cases however, that time + * fell short. + * + * A safe software solution is to force MCU to act like in Estar mode, + * then delay 1us (in ppm code) prior to assert J_CHNG_L signal. + * To reverse the effect, upon exiting Estar, software restores the + * MCU to its original state. + */ +/* ARGSUSED1 */ +void +cpu_change_speed(uint64_t divisor, uint64_t arg2) +{ + bus_config_eclk_t *bceclk; + uint64_t reg; + + for (bceclk = bus_config_eclk; bceclk->divisor; bceclk++) { + if (bceclk->divisor != divisor) + continue; + reg = get_safari_config(); + reg &= ~SAFARI_CONFIG_ECLK_MASK; + reg |= bceclk->mask; + set_safari_config(reg); + CPU->cpu_m.divisor = (uchar_t)divisor; + return; + } + /* + * We will reach here only if OBP and kernel don't agree on + * the speeds supported by the CPU. + */ + cmn_err(CE_WARN, "cpu_change_speed: bad divisor %" PRIu64, divisor); +} + +/* + * Cpu private initialization. This includes allocating the cpu_private + * data structure, initializing it, and initializing the scrubber for this + * cpu. 
This function calls cpu_init_ecache_scrub_dr to init the scrubber. + * We use kmem_cache_create for the cheetah private data structure because + * it needs to be allocated on a PAGESIZE (8192) byte boundary. + */ +void +cpu_init_private(struct cpu *cp) +{ + cheetah_private_t *chprp; + int i; + + ASSERT(CPU_PRIVATE(cp) == NULL); + + /* LINTED: E_TRUE_LOGICAL_EXPR */ + ASSERT((offsetof(cheetah_private_t, chpr_tl1_err_data) + + sizeof (ch_err_tl1_data_t) * CH_ERR_TL1_TLMAX) <= PAGESIZE); + + /* + * Running with a Cheetah+, Jaguar, or Panther on a Cheetah CPU + * machine is not a supported configuration. Attempting to do so + * may result in unpredictable failures (e.g. running Cheetah+ + * CPUs with Cheetah E$ disp flush) so don't allow it. + * + * This is just defensive code since this configuration mismatch + * should have been caught prior to OS execution. + */ + if (!IS_CHEETAH(cpunodes[cp->cpu_id].implementation)) { + cmn_err(CE_PANIC, "CPU%d: UltraSPARC-III+/IV/IV+ not" + " supported on UltraSPARC-III code\n", cp->cpu_id); + } + + /* + * If the ch_private_cache has not been created, create it. + */ + if (ch_private_cache == NULL) { + ch_private_cache = kmem_cache_create("ch_private_cache", + sizeof (cheetah_private_t), PAGESIZE, NULL, NULL, + NULL, NULL, static_arena, 0); + } + + chprp = CPU_PRIVATE(cp) = kmem_cache_alloc(ch_private_cache, KM_SLEEP); + + bzero(chprp, sizeof (cheetah_private_t)); + chprp->chpr_fecctl0_logout.clo_data.chd_afar = LOGOUT_INVALID; + chprp->chpr_cecc_logout.clo_data.chd_afar = LOGOUT_INVALID; + chprp->chpr_async_logout.clo_data.chd_afar = LOGOUT_INVALID; + for (i = 0; i < CH_ERR_TL1_TLMAX; i++) + chprp->chpr_tl1_err_data[i].ch_err_tl1_logout.clo_data.chd_afar + = LOGOUT_INVALID; + + chprp->chpr_icache_size = CH_ICACHE_SIZE; + chprp->chpr_icache_linesize = CH_ICACHE_LSIZE; + + cpu_init_ecache_scrub_dr(cp); + + chprp->chpr_ec_set_size = cpunodes[cp->cpu_id].ecache_size / + cpu_ecache_nway(); + + adjust_hw_copy_limits(cpunodes[cp->cpu_id].ecache_size); + ch_err_tl1_paddrs[cp->cpu_id] = va_to_pa(chprp); + ASSERT(ch_err_tl1_paddrs[cp->cpu_id] != -1); +} + +/* + * Clear the error state registers for this CPU. + * For Cheetah, just clear the AFSR + */ +void +set_cpu_error_state(ch_cpu_errors_t *cpu_error_regs) +{ + set_asyncflt(cpu_error_regs->afsr & ~C_AFSR_FATAL_ERRS); +} + +/* + * For Cheetah, the error recovery code uses an alternate flush area in the + * TL>0 fast ECC handler. ecache_tl1_flushaddr is the physical address of + * this exclusive displacement flush area. + */ +uint64_t ecache_tl1_flushaddr = (uint64_t)-1; /* physaddr for E$ flushing */ + +/* + * Allocate and initialize the exclusive displacement flush area. + * Must be called before startup_bop_gone(). + */ +caddr_t +ecache_init_scrub_flush_area(caddr_t alloc_base) +{ + unsigned size = 2 * CH_ECACHE_8M_SIZE; + caddr_t tmp_alloc_base = alloc_base; + caddr_t flush_alloc_base = + (caddr_t)roundup((uintptr_t)alloc_base, size); + caddr_t ecache_tl1_virtaddr; + + /* + * Allocate the physical memory for the exclusive flush area + * + * Need to allocate an exclusive flush area that is twice the + * largest supported E$ size, physically contiguous, and + * aligned on twice the largest E$ size boundary. + * + * Memory allocated via BOP_ALLOC is included in the "cage" + * from the DR perspective and due to this, its physical + * address will never change and the memory will not be + * removed. 
+ * + * BOP_ALLOC takes 4 arguments: bootops, virtual address hint, + * size of the area to allocate, and alignment of the area to + * allocate. It returns zero if the allocation fails, or the + * virtual address for a successful allocation. Memory BOP_ALLOC'd + * is physically contiguous. + */ + if ((ecache_tl1_virtaddr = (caddr_t)BOP_ALLOC(bootops, + flush_alloc_base, size, size)) != NULL) { + + tmp_alloc_base = + (caddr_t)roundup((uintptr_t)(ecache_tl1_virtaddr + size), + ecache_alignsize); + + /* + * get the physical address of the exclusive flush area + */ + ecache_tl1_flushaddr = va_to_pa(ecache_tl1_virtaddr); + + } else { + ecache_tl1_virtaddr = (caddr_t)-1; + cmn_err(CE_NOTE, "!ecache_init_scrub_flush_area failed\n"); + } + + return (tmp_alloc_base); +} + +/* + * Update cpu_offline_set so the scrubber knows which cpus are offline + */ +/*ARGSUSED*/ +int +cpu_scrub_cpu_setup(cpu_setup_t what, int cpuid, void *arg) +{ + switch (what) { + case CPU_ON: + case CPU_INIT: + CPUSET_DEL(cpu_offline_set, cpuid); + break; + case CPU_OFF: + CPUSET_ADD(cpu_offline_set, cpuid); + break; + default: + break; + } + return (0); +} diff --git a/usr/src/uts/sun4u/cpu/us3_cheetah_asm.s b/usr/src/uts/sun4u/cpu/us3_cheetah_asm.s new file mode 100644 index 0000000000..4efb1d5b38 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/us3_cheetah_asm.s @@ -0,0 +1,456 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ * + * Assembly code support for the Cheetah module + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#if !defined(lint) +#include "assym.h" +#endif /* lint */ + +#include <sys/asm_linkage.h> +#include <sys/mmu.h> +#include <vm/hat_sfmmu.h> +#include <sys/machparam.h> +#include <sys/machcpuvar.h> +#include <sys/machthread.h> +#include <sys/machtrap.h> +#include <sys/privregs.h> +#include <sys/asm_linkage.h> +#include <sys/trap.h> +#include <sys/cheetahregs.h> +#include <sys/us3_module.h> +#include <sys/xc_impl.h> +#include <sys/intreg.h> +#include <sys/async.h> +#include <sys/clock.h> +#include <sys/cheetahasm.h> + +#ifdef TRAPTRACE +#include <sys/traptrace.h> +#endif /* TRAPTRACE */ + +#if !defined(lint) + +/* BEGIN CSTYLED */ + +/* + * Cheetah version to flush an Ecache line by index (aliased address) + */ +#define ECACHE_REFLUSH_LINE(ecache_size, alias_address, scr2) \ + ldxa [alias_address]ASI_MEM, %g0 + +#define ECACHE_FLUSH_LINE(physaddr, ecache_size, scr1, scr2) \ + xor physaddr, ecache_size, scr1; \ + add ecache_size, ecache_size, scr2; \ + sub scr2, 1, scr2; \ + and scr1, scr2, scr1; \ + ASM_LDX(scr2, ecache_flushaddr); \ + add scr1, scr2, scr1; \ + ECACHE_REFLUSH_LINE(ecache_size, scr1, scr2) + +/* END CSTYLED */ + +#endif /* !lint */ + + +/* + * Fast ECC error at TL>0 handler + * We get here via trap 70 at TL>0->Software trap 0 at TL>0. We enter + * this routine with %g1 and %g2 already saved in %tpc, %tnpc and %tstate. + * For a complete description of the Fast ECC at TL>0 handling see the + * comment block "Cheetah/Cheetah+ Fast ECC at TL>0 trap strategy" in + * us3_common_asm.s + */ +#if defined(lint) + +void +fast_ecc_tl1_err(void) +{} + +#else /* lint */ + + .section ".text" + .align 64 + ENTRY_NP(fast_ecc_tl1_err) + + /* + * This macro turns off the D$/I$ if they are on and saves their + * original state in ch_err_tl1_tmp, saves all the %g registers in the + * ch_err_tl1_data structure, updates the ch_err_tl1_flags and saves + * the %tpc in ch_err_tl1_tpc. At the end of this macro, %g1 will + * point to the ch_err_tl1_data structure and the original D$/I$ state + * will be saved in ch_err_tl1_tmp. All %g registers except for %g1 + * will be available. + */ + CH_ERR_TL1_FECC_ENTER; + + /* + * Get the diagnostic logout data. %g4 must be initialized to + * current CEEN state, %g5 must point to logout structure in + * ch_err_tl1_data_t. %g3 will contain the nesting count upon + * return. + */ + ldxa [%g0]ASI_ESTATE_ERR, %g4 + and %g4, EN_REG_CEEN, %g4 + add %g1, CH_ERR_TL1_LOGOUT, %g5 + DO_TL1_CPU_LOGOUT(%g3, %g2, %g4, %g5, %g6, %g3, %g4) + + /* + * If the logout nesting count is exceeded, we're probably + * not making any progress, try to panic instead. + */ + cmp %g3, CLO_NESTING_MAX + bge fecc_tl1_err + nop + + /* + * Save the current CEEN and NCEEN state in %g7 and turn them off + * before flushing the Ecache. + */ + ldxa [%g0]ASI_ESTATE_ERR, %g7 + andn %g7, EN_REG_CEEN | EN_REG_NCEEN, %g5 + stxa %g5, [%g0]ASI_ESTATE_ERR + membar #Sync + + /* + * Flush the Ecache, using the largest possible cache size with the + * smallest possible line size since we can't get the actual sizes + * from the cpu_node due to DTLB misses. + */ + set CH_ECACHE_8M_SIZE, %g4 + set CH_ECACHE_MIN_LSIZE, %g5 + + /* + * Use a different flush address to avoid recursion if the error + * exists in ecache_flushaddr. + */ + ASM_LDX(%g6, ecache_tl1_flushaddr) + cmp %g6, -1 ! 
check if address is valid + be %xcc, fecc_tl1_err + nop + CH_ECACHE_FLUSHALL(%g4, %g5, %g6) + + /* + * Restore CEEN and NCEEN to the previous state. + */ + stxa %g7, [%g0]ASI_ESTATE_ERR + membar #Sync + + /* + * If we turned off the D$, then flush it and turn it back on. + */ + ldxa [%g1 + CH_ERR_TL1_TMP]%asi, %g3 + andcc %g3, CH_ERR_TSTATE_DC_ON, %g0 + bz %xcc, 3f + nop + + /* + * Flush the D$. + */ + ASM_LD(%g4, dcache_size) + ASM_LD(%g5, dcache_linesize) + CH_DCACHE_FLUSHALL(%g4, %g5, %g6) + + /* + * Turn the D$ back on. + */ + ldxa [%g0]ASI_DCU, %g3 + or %g3, DCU_DC, %g3 + stxa %g3, [%g0]ASI_DCU + membar #Sync +3: + /* + * If we turned off the I$, then flush it and turn it back on. + */ + ldxa [%g1 + CH_ERR_TL1_TMP]%asi, %g3 + andcc %g3, CH_ERR_TSTATE_IC_ON, %g0 + bz %xcc, 4f + nop + + /* + * Flush the I$. + */ + ASM_LD(%g4, icache_size) + ASM_LD(%g5, icache_linesize) + CH_ICACHE_FLUSHALL(%g4, %g5, %g6, %g3) + + /* + * Turn the I$ back on. Changing DCU_IC requires flush. + */ + ldxa [%g0]ASI_DCU, %g3 + or %g3, DCU_IC, %g3 + stxa %g3, [%g0]ASI_DCU + flush %g0 +4: + +#ifdef TRAPTRACE + /* + * Get current trap trace entry physical pointer. + */ + CPU_INDEX(%g6, %g5) + sll %g6, TRAPTR_SIZE_SHIFT, %g6 + set trap_trace_ctl, %g5 + add %g6, %g5, %g6 + ld [%g6 + TRAPTR_LIMIT], %g5 + tst %g5 + be %icc, skip_traptrace + nop + ldx [%g6 + TRAPTR_PBASE], %g5 + ld [%g6 + TRAPTR_OFFSET], %g4 + add %g5, %g4, %g5 + + /* + * Create trap trace entry. + */ + rd %asi, %g7 + wr %g0, TRAPTR_ASI, %asi + rd STICK, %g4 + stxa %g4, [%g5 + TRAP_ENT_TICK]%asi + rdpr %tl, %g4 + stha %g4, [%g5 + TRAP_ENT_TL]%asi + rdpr %tt, %g4 + stha %g4, [%g5 + TRAP_ENT_TT]%asi + rdpr %tpc, %g4 + stna %g4, [%g5 + TRAP_ENT_TPC]%asi + rdpr %tstate, %g4 + stxa %g4, [%g5 + TRAP_ENT_TSTATE]%asi + stna %sp, [%g5 + TRAP_ENT_SP]%asi + stna %g0, [%g5 + TRAP_ENT_TR]%asi + wr %g0, %g7, %asi + ldxa [%g1 + CH_ERR_TL1_SDW_AFAR]%asi, %g3 + ldxa [%g1 + CH_ERR_TL1_SDW_AFSR]%asi, %g4 + wr %g0, TRAPTR_ASI, %asi + stna %g3, [%g5 + TRAP_ENT_F1]%asi + stna %g4, [%g5 + TRAP_ENT_F2]%asi + wr %g0, %g7, %asi + ldxa [%g1 + CH_ERR_TL1_AFAR]%asi, %g3 + ldxa [%g1 + CH_ERR_TL1_AFSR]%asi, %g4 + wr %g0, TRAPTR_ASI, %asi + stna %g3, [%g5 + TRAP_ENT_F3]%asi + stna %g4, [%g5 + TRAP_ENT_F4]%asi + wr %g0, %g7, %asi + + /* + * Advance trap trace pointer. + */ + ld [%g6 + TRAPTR_OFFSET], %g5 + ld [%g6 + TRAPTR_LIMIT], %g4 + st %g5, [%g6 + TRAPTR_LAST_OFFSET] + add %g5, TRAP_ENT_SIZE, %g5 + sub %g4, TRAP_ENT_SIZE, %g4 + cmp %g5, %g4 + movge %icc, 0, %g5 + st %g5, [%g6 + TRAPTR_OFFSET] +skip_traptrace: +#endif /* TRAPTRACE */ + + /* + * If nesting count is not zero, skip all the AFSR/AFAR + * handling and just do the necessary cache-flushing. + */ + ldxa [%g1 + CH_ERR_TL1_NEST_CNT]%asi, %g2 + brnz %g2, 6f + nop + + /* + * If a UCU followed by a WDU has occurred go ahead and panic + * since a UE will occur (on the retry) before the UCU and WDU + * messages are enqueued. + */ + ldxa [%g1 + CH_ERR_TL1_AFSR]%asi, %g3 + set 1, %g4 + sllx %g4, C_AFSR_UCU_SHIFT, %g4 + btst %g4, %g3 ! UCU in original AFSR? + bz %xcc, 6f + nop + ldxa [%g0]ASI_AFSR, %g4 ! current AFSR + or %g3, %g4, %g3 ! %g3 = original + current AFSR + set 1, %g4 + sllx %g4, C_AFSR_WDU_SHIFT, %g4 + btst %g4, %g3 ! WDU in original or current AFSR? + bnz %xcc, fecc_tl1_err + nop + +6: + /* + * We fall into this macro if we've successfully logged the error in + * the ch_err_tl1_data structure and want the PIL15 softint to pick + * it up and log it. %g1 must point to the ch_err_tl1_data structure. 
+ * Restores the %g registers and issues retry. + */ + CH_ERR_TL1_EXIT; + + /* + * Establish panic exit label. + */ + CH_ERR_TL1_PANIC_EXIT(fecc_tl1_err); + + SET_SIZE(fast_ecc_tl1_err) + +#endif /* lint */ + + +#if defined(lint) +/* + * scrubphys - Pass in the aligned physical memory address + * that you want to scrub, along with the ecache set size. + * + * 1) Displacement flush the E$ line corresponding to %addr. + * The first ldxa guarantees that the %addr is no longer in + * M, O, or E (goes to I or S (if instruction fetch also happens). + * 2) "Write" the data using a CAS %addr,%g0,%g0. + * The casxa guarantees a transition from I to M or S to M. + * 3) Displacement flush the E$ line corresponding to %addr. + * The second ldxa pushes the M line out of the ecache, into the + * writeback buffers, on the way to memory. + * 4) The "membar #Sync" pushes the cache line out of the writeback + * buffers onto the bus, on the way to dram finally. + * + * This is a modified version of the algorithm suggested by Gary Lauterbach. + * In theory the CAS %addr,%g0,%g0 is supposed to mark the addr's cache line + * as modified, but then we found out that for spitfire, if it misses in the + * E$ it will probably install as an M, but if it hits in the E$, then it + * will stay E, if the store doesn't happen. So the first displacement flush + * should ensure that the CAS will miss in the E$. Arrgh. + */ +/* ARGSUSED */ +void +scrubphys(uint64_t paddr, int ecache_set_size) +{} + +#else /* lint */ + ENTRY(scrubphys) + rdpr %pstate, %o4 + andn %o4, PSTATE_IE | PSTATE_AM, %o5 + wrpr %o5, %g0, %pstate ! clear IE, AM bits + + ECACHE_FLUSH_LINE(%o0, %o1, %o2, %o3) + casxa [%o0]ASI_MEM, %g0, %g0 + ECACHE_REFLUSH_LINE(%o1, %o2, %o3) + + wrpr %g0, %o4, %pstate ! restore earlier pstate register value + + retl + membar #Sync ! move the data out of the load buffer + SET_SIZE(scrubphys) + +#endif /* lint */ + + +#if defined(lint) +/* + * clearphys - Pass in the aligned physical memory address + * that you want to push out, as a ecache_linesize byte block of zeros, + * from the ecache zero-filled. + */ +/* ARGSUSED */ +void +clearphys(uint64_t paddr, int ecache_set_size, int ecache_linesize) +{ +} + +#else /* lint */ + ENTRY(clearphys) + /* turn off IE, AM bits */ + rdpr %pstate, %o4 + andn %o4, PSTATE_IE | PSTATE_AM, %o5 + wrpr %o5, %g0, %pstate + + /* turn off NCEEN */ + ldxa [%g0]ASI_ESTATE_ERR, %o5 + andn %o5, EN_REG_NCEEN, %o3 + stxa %o3, [%g0]ASI_ESTATE_ERR + membar #Sync + + /* zero the E$ line */ +1: + subcc %o2, 8, %o2 + bge 1b + stxa %g0, [%o0 + %o2]ASI_MEM + + ECACHE_FLUSH_LINE(%o0, %o1, %o2, %o3) + casxa [%o0]ASI_MEM, %g0, %g0 + ECACHE_REFLUSH_LINE(%o1, %o2, %o3) + + /* clear the AFSR */ + ldxa [%g0]ASI_AFSR, %o1 + stxa %o1, [%g0]ASI_AFSR + membar #Sync + + /* turn NCEEN back on */ + stxa %o5, [%g0]ASI_ESTATE_ERR + membar #Sync + + /* return and re-enable IE and AM */ + retl + wrpr %g0, %o4, %pstate + SET_SIZE(clearphys) + +#endif /* lint */ + + +#if defined(lint) +/* + * Cheetah Ecache displacement flush the specified line from the E$ + * + * Register usage: + * %o0 - 64 bit physical address for flushing + * %o1 - Ecache set size + */ +/*ARGSUSED*/ +void +ecache_flush_line(uint64_t flushaddr, int ec_set_size) +{ +} +#else /* lint */ + ENTRY(ecache_flush_line) + + ECACHE_FLUSH_LINE(%o0, %o1, %o2, %o3) + + retl + nop + SET_SIZE(ecache_flush_line) +#endif /* lint */ + + +#if defined(lint) +/* + * This routine will not be called in Cheetah systems. 
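+ * It is retained here only as an empty stub so that the flush_ipb()
+ * symbol is still exported by this module; the implementation below
+ * simply returns.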
+ */ +void +flush_ipb(void) +{ return; } + +#else /* lint */ + + ENTRY(flush_ipb) + retl + nop + SET_SIZE(flush_ipb) + +#endif /* lint */ diff --git a/usr/src/uts/sun4u/cpu/us3_cheetahplus.c b/usr/src/uts/sun4u/cpu/us3_cheetahplus.c new file mode 100644 index 0000000000..c77addfade --- /dev/null +++ b/usr/src/uts/sun4u/cpu/us3_cheetahplus.c @@ -0,0 +1,1317 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/ddi.h> +#include <sys/sysmacros.h> +#include <sys/archsystm.h> +#include <sys/vmsystm.h> +#include <sys/machparam.h> +#include <sys/machsystm.h> +#include <sys/machthread.h> +#include <sys/cpu.h> +#include <sys/cmp.h> +#include <sys/elf_SPARC.h> +#include <vm/hat_sfmmu.h> +#include <vm/seg_kmem.h> +#include <sys/cpuvar.h> +#include <sys/cheetahregs.h> +#include <sys/us3_module.h> +#include <sys/async.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/dditypes.h> +#include <sys/prom_debug.h> +#include <sys/prom_plat.h> +#include <sys/cpu_module.h> +#include <sys/sysmacros.h> +#include <sys/intreg.h> +#include <sys/clock.h> +#include <sys/platform_module.h> +#include <sys/machtrap.h> +#include <sys/ontrap.h> +#include <sys/panic.h> +#include <sys/memlist.h> +#include <sys/bootconf.h> +#include <sys/ivintr.h> +#include <sys/atomic.h> +#include <sys/fm/protocol.h> +#include <sys/fm/cpu/UltraSPARC-III.h> +#include <sys/fm/util.h> + +#ifdef CHEETAHPLUS_ERRATUM_25 +#include <sys/cyclic.h> +#endif /* CHEETAHPLUS_ERRATUM_25 */ + +/* + * See comment above cpu_scrub_cpu_setup() for description + */ +#define SCRUBBER_NEITHER_CORE_ONLINE 0x0 +#define SCRUBBER_CORE_0_ONLINE 0x1 +#define SCRUBBER_CORE_1_ONLINE 0x2 +#define SCRUBBER_BOTH_CORES_ONLINE (SCRUBBER_CORE_0_ONLINE | \ + SCRUBBER_CORE_1_ONLINE) + +static int pn_matching_valid_l2_line(uint64_t faddr, ch_ec_data_t *clo_l2_data); +static void cpu_async_log_tlb_parity_err(void *flt); +static cpu_t *cpu_get_sibling_core(cpu_t *cpup); + + +/* + * Setup trap handlers. 
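+ * In addition to the PIL15 and fast-ECC handlers installed by the
+ * plain Cheetah module, this version also wires up the D$ and I$
+ * parity error handlers at both TL=0 and TL>0 (using software traps
+ * 1 and 2 for the TL>0 continuations).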
+ */ +void +cpu_init_trap(void) +{ + CH_SET_TRAP(tt_pil15, ch_pil15_interrupt_instr); + + CH_SET_TRAP(tt0_fecc, fecc_err_instr); + CH_SET_TRAP(tt1_fecc, fecc_err_tl1_instr); + CH_SET_TRAP(tt1_swtrap0, fecc_err_tl1_cont_instr); + + CH_SET_TRAP(tt0_dperr, dcache_parity_instr); + CH_SET_TRAP(tt1_dperr, dcache_parity_tl1_instr); + CH_SET_TRAP(tt1_swtrap1, dcache_parity_tl1_cont_instr); + + CH_SET_TRAP(tt0_iperr, icache_parity_instr); + CH_SET_TRAP(tt1_iperr, icache_parity_tl1_instr); + CH_SET_TRAP(tt1_swtrap2, icache_parity_tl1_cont_instr); +} + +/* + * Set the magic constants of the implementation. + */ +/*ARGSUSED*/ +void +cpu_fiximp(dnode_t dnode) +{ + int i, a; + extern int vac_size, vac_shift; + extern uint_t vac_mask; + + dcache_size = CH_DCACHE_SIZE; + dcache_linesize = CH_DCACHE_LSIZE; + + icache_size = CHP_ICACHE_MAX_SIZE; + icache_linesize = CHP_ICACHE_MIN_LSIZE; + + ecache_size = CH_ECACHE_MAX_SIZE; + ecache_alignsize = CH_ECACHE_MAX_LSIZE; + ecache_associativity = CHP_ECACHE_MIN_NWAY; + + /* + * ecache_setsize needs to maximum of all cpu ecache setsizes + */ + ecache_setsize = CHP_ECACHE_MAX_SETSIZE; + ASSERT(ecache_setsize >= (ecache_size / ecache_associativity)); + + vac_size = CH_VAC_SIZE; + vac_mask = MMU_PAGEMASK & (vac_size - 1); + i = 0; a = vac_size; + while (a >>= 1) + ++i; + vac_shift = i; + shm_alignment = vac_size; + vac = 1; +} + +void +send_mondo_set(cpuset_t set) +{ + int lo, busy, nack, shipped = 0; + uint16_t i, cpuids[IDSR_BN_SETS]; + uint64_t idsr, nackmask = 0, busymask, curnack, curbusy; + uint64_t starttick, endtick, tick, lasttick; +#if (NCPU > IDSR_BN_SETS) + int index = 0; + int ncpuids = 0; +#endif +#ifdef CHEETAHPLUS_ERRATUM_25 + int recovered = 0; + int cpuid; +#endif + + ASSERT(!CPUSET_ISNULL(set)); + starttick = lasttick = gettick(); + +#if (NCPU <= IDSR_BN_SETS) + for (i = 0; i < NCPU; i++) + if (CPU_IN_SET(set, i)) { + shipit(i, shipped); + nackmask |= IDSR_NACK_BIT(shipped); + cpuids[shipped++] = i; + CPUSET_DEL(set, i); + if (CPUSET_ISNULL(set)) + break; + } + CPU_STATS_ADDQ(CPU, sys, xcalls, shipped); +#else + for (i = 0; i < NCPU; i++) + if (CPU_IN_SET(set, i)) { + ncpuids++; + + /* + * Ship only to the first (IDSR_BN_SETS) CPUs. If we + * find we have shipped to more than (IDSR_BN_SETS) + * CPUs, set "index" to the highest numbered CPU in + * the set so we can ship to other CPUs a bit later on. + */ + if (shipped < IDSR_BN_SETS) { + shipit(i, shipped); + nackmask |= IDSR_NACK_BIT(shipped); + cpuids[shipped++] = i; + CPUSET_DEL(set, i); + if (CPUSET_ISNULL(set)) + break; + } else + index = (int)i; + } + + CPU_STATS_ADDQ(CPU, sys, xcalls, ncpuids); +#endif + + busymask = IDSR_NACK_TO_BUSY(nackmask); + busy = nack = 0; + endtick = starttick + xc_tick_limit; + for (;;) { + idsr = getidsr(); +#if (NCPU <= IDSR_BN_SETS) + if (idsr == 0) + break; +#else + if (idsr == 0 && shipped == ncpuids) + break; +#endif + tick = gettick(); + /* + * If there is a big jump between the current tick + * count and lasttick, we have probably hit a break + * point. Adjust endtick accordingly to avoid panic. 
+ */ + if (tick > (lasttick + xc_tick_jump_limit)) + endtick += (tick - lasttick); + lasttick = tick; + if (tick > endtick) { + if (panic_quiesce) + return; +#ifdef CHEETAHPLUS_ERRATUM_25 + cpuid = -1; + for (i = 0; i < IDSR_BN_SETS; i++) { + if (idsr & (IDSR_NACK_BIT(i) | + IDSR_BUSY_BIT(i))) { + cpuid = cpuids[i]; + break; + } + } + if (cheetah_sendmondo_recover && cpuid != -1 && + recovered == 0) { + if (mondo_recover(cpuid, i)) { + /* + * We claimed the whole memory or + * full scan is disabled. + */ + recovered++; + } + tick = gettick(); + endtick = tick + xc_tick_limit; + lasttick = tick; + /* + * Recheck idsr + */ + continue; + } else +#endif /* CHEETAHPLUS_ERRATUM_25 */ + { + cmn_err(CE_CONT, "send mondo timeout " + "[%d NACK %d BUSY]\nIDSR 0x%" + "" PRIx64 " cpuids:", nack, busy, idsr); + for (i = 0; i < IDSR_BN_SETS; i++) { + if (idsr & (IDSR_NACK_BIT(i) | + IDSR_BUSY_BIT(i))) { + cmn_err(CE_CONT, " 0x%x", + cpuids[i]); + } + } + cmn_err(CE_CONT, "\n"); + cmn_err(CE_PANIC, "send_mondo_set: timeout"); + } + } + curnack = idsr & nackmask; + curbusy = idsr & busymask; +#if (NCPU > IDSR_BN_SETS) + if (shipped < ncpuids) { + uint64_t cpus_left; + uint16_t next = (uint16_t)index; + + cpus_left = ~(IDSR_NACK_TO_BUSY(curnack) | curbusy) & + busymask; + + if (cpus_left) { + do { + /* + * Sequence through and ship to the + * remainder of the CPUs in the system + * (e.g. other than the first + * (IDSR_BN_SETS)) in reverse order. + */ + lo = lowbit(cpus_left) - 1; + i = IDSR_BUSY_IDX(lo); + shipit(next, i); + shipped++; + cpuids[i] = next; + + /* + * If we've processed all the CPUs, + * exit the loop now and save + * instructions. + */ + if (shipped == ncpuids) + break; + + for ((index = ((int)next - 1)); + index >= 0; index--) + if (CPU_IN_SET(set, index)) { + next = (uint16_t)index; + break; + } + + cpus_left &= ~(1ull << lo); + } while (cpus_left); +#ifdef CHEETAHPLUS_ERRATUM_25 + /* + * Clear recovered because we are sending to + * a new set of targets. + */ + recovered = 0; +#endif + continue; + } + } +#endif + if (curbusy) { + busy++; + continue; + } + +#ifdef SEND_MONDO_STATS + { + int n = gettick() - starttick; + if (n < 8192) + x_nack_stimes[n >> 7]++; + } +#endif + while (gettick() < (tick + sys_clock_mhz)) + ; + do { + lo = lowbit(curnack) - 1; + i = IDSR_NACK_IDX(lo); + shipit(cpuids[i], i); + curnack &= ~(1ull << lo); + } while (curnack); + nack++; + busy = 0; + } +#ifdef SEND_MONDO_STATS + { + int n = gettick() - starttick; + if (n < 8192) + x_set_stimes[n >> 7]++; + else + x_set_ltimes[(n >> 13) & 0xf]++; + } + x_set_cpus[shipped]++; +#endif +} + +/* + * Handles error logging for implementation specific error types + */ +/*ARGSUSED1*/ +int +cpu_impl_async_log_err(void *flt, errorq_elem_t *eqep) +{ + ch_async_flt_t *ch_flt = (ch_async_flt_t *)flt; + struct async_flt *aflt = (struct async_flt *)flt; + + switch (ch_flt->flt_type) { + + case CPU_IC_PARITY: + cpu_async_log_ic_parity_err(flt); + return (CH_ASYNC_LOG_DONE); + + case CPU_DC_PARITY: + cpu_async_log_dc_parity_err(flt); + return (CH_ASYNC_LOG_DONE); + + case CPU_DUE: + cpu_log_err(aflt); + cpu_page_retire(ch_flt); + return (CH_ASYNC_LOG_DONE); + + case CPU_ITLB_PARITY: + case CPU_DTLB_PARITY: + cpu_async_log_tlb_parity_err(flt); + return (CH_ASYNC_LOG_DONE); + + default: + return (CH_ASYNC_LOG_UNKNOWN); + } +} + +/* + * Figure out if Ecache is direct-mapped (Cheetah or Cheetah+ with Ecache + * control ECCR_ASSOC bit off or 2-way (Cheetah+ with ECCR_ASSOC on). 
+ * We need to do this on the fly because we may have mixed Cheetah+'s with + * both direct and 2-way Ecaches. Panther only supports 4-way L3$. + */ +int +cpu_ecache_nway(void) +{ + if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) + return (PN_L3_NWAYS); + return ((get_ecache_ctrl() & ECCR_ASSOC) ? 2 : 1); +} + +/* + * Note that these are entered into the table: Fatal Errors (PERR, IERR, ISAP, + * EMU, IMU) first, orphaned UCU/UCC, AFAR Overwrite policy, finally IVU, IVC. + * Afar overwrite policy is: + * Class 4: + * AFSR -- UCC, UCU, TUE, TSCE, TUE_SH + * AFSR_EXT -- L3_UCC, L3_UCU, L3_TUE, L3_TUE_SH + * Class 3: + * AFSR -- UE, DUE, EDU, WDU, CPU + * AFSR_EXT -- L3_EDU, L3_WDU, L3_CPU + * Class 2: + * AFSR -- CE, EDC, EMC, WDC, CPC, THCE + * AFSR_EXT -- L3_EDC, L3_WDC, L3_CPC, L3_THCE + * Class 1: + * AFSR -- TO, DTO, BERR, DBERR + */ +ecc_type_to_info_t ecc_type_to_info[] = { + + /* Fatal Errors */ + C_AFSR_PERR, "PERR ", ECC_ALL_TRAPS, + CPU_FATAL, "PERR Fatal", + FM_EREPORT_PAYLOAD_SYSTEM2, + FM_EREPORT_CPU_USIII_PERR, + C_AFSR_IERR, "IERR ", ECC_ALL_TRAPS, + CPU_FATAL, "IERR Fatal", + FM_EREPORT_PAYLOAD_SYSTEM2, + FM_EREPORT_CPU_USIII_IERR, + C_AFSR_ISAP, "ISAP ", ECC_ALL_TRAPS, + CPU_FATAL, "ISAP Fatal", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_ISAP, + C_AFSR_L3_TUE_SH, "L3_TUE_SH ", ECC_C_TRAP, + CPU_FATAL, "L3_TUE_SH Fatal", + FM_EREPORT_PAYLOAD_L3_TAG_ECC, + FM_EREPORT_CPU_USIII_L3_TUE_SH, + C_AFSR_L3_TUE, "L3_TUE ", ECC_C_TRAP, + CPU_FATAL, "L3_TUE Fatal", + FM_EREPORT_PAYLOAD_L3_TAG_ECC, + FM_EREPORT_CPU_USIII_L3_TUE, + C_AFSR_TUE_SH, "TUE_SH ", ECC_C_TRAP, + CPU_FATAL, "TUE_SH Fatal", + FM_EREPORT_PAYLOAD_L2_TAG_ECC, + FM_EREPORT_CPU_USIII_TUE_SH, + C_AFSR_TUE, "TUE ", ECC_ALL_TRAPS, + CPU_FATAL, "TUE Fatal", + FM_EREPORT_PAYLOAD_L2_TAG_ECC, + FM_EREPORT_CPU_USIII_TUE, + C_AFSR_EMU, "EMU ", ECC_ASYNC_TRAPS, + CPU_FATAL, "EMU Fatal", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_EMU, + C_AFSR_IMU, "IMU ", ECC_C_TRAP, + CPU_FATAL, "IMU Fatal", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_IMU, + + /* L3$ Address parity errors are reported via the MECC bit */ + C_AFSR_L3_MECC, "L3_MECC ", ECC_MECC_TRAPS, + CPU_L3_ADDR_PE, "L3 Address Parity", + FM_EREPORT_PAYLOAD_L3_DATA, + FM_EREPORT_CPU_USIII_L3_MECC, + + /* Orphaned UCC/UCU Errors */ + C_AFSR_L3_UCU, "L3_OUCU ", ECC_ORPH_TRAPS, + CPU_ORPH, "Orphaned L3_UCU", + FM_EREPORT_PAYLOAD_L3_DATA, + FM_EREPORT_CPU_USIII_L3_UCU, + C_AFSR_L3_UCC, "L3_OUCC ", ECC_ORPH_TRAPS, + CPU_ORPH, "Orphaned L3_UCC", + FM_EREPORT_PAYLOAD_L3_DATA, + FM_EREPORT_CPU_USIII_L3_UCC, + C_AFSR_UCU, "OUCU ", ECC_ORPH_TRAPS, + CPU_ORPH, "Orphaned UCU", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_UCU, + C_AFSR_UCC, "OUCC ", ECC_ORPH_TRAPS, + CPU_ORPH, "Orphaned UCC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_UCC, + + /* UCU, UCC */ + C_AFSR_L3_UCU, "L3_UCU ", ECC_F_TRAP, + CPU_UE_ECACHE, "L3_UCU", + FM_EREPORT_PAYLOAD_L3_DATA, + FM_EREPORT_CPU_USIII_L3_UCU, + C_AFSR_L3_UCC, "L3_UCC ", ECC_F_TRAP, + CPU_CE_ECACHE, "L3_UCC", + FM_EREPORT_PAYLOAD_L3_DATA, + FM_EREPORT_CPU_USIII_L3_UCC, + C_AFSR_UCU, "UCU ", ECC_F_TRAP, + CPU_UE_ECACHE, "UCU", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_UCU, + C_AFSR_UCC, "UCC ", ECC_F_TRAP, + CPU_CE_ECACHE, "UCC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_UCC, + C_AFSR_TSCE, "TSCE ", ECC_F_TRAP, + CPU_CE_ECACHE, "TSCE", + FM_EREPORT_PAYLOAD_L2_TAG_ECC, + FM_EREPORT_CPU_USIII_TSCE, + + /* UE, EDU:ST, EDU:BLD, WDU, CPU */ + C_AFSR_UE, "UE ", ECC_ASYNC_TRAPS, + 
CPU_UE, "Uncorrectable system bus (UE)", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_UE, + C_AFSR_L3_EDU, "L3_EDU ", ECC_C_TRAP, + CPU_UE_ECACHE_RETIRE, "L3_EDU:ST", + FM_EREPORT_PAYLOAD_L3_DATA, + FM_EREPORT_CPU_USIII_L3_EDUST, + C_AFSR_L3_EDU, "L3_EDU ", ECC_D_TRAP, + CPU_UE_ECACHE_RETIRE, "L3_EDU:BLD", + FM_EREPORT_PAYLOAD_L3_DATA, + FM_EREPORT_CPU_USIII_L3_EDUBL, + C_AFSR_L3_WDU, "L3_WDU ", ECC_C_TRAP, + CPU_UE_ECACHE_RETIRE, "L3_WDU", + FM_EREPORT_PAYLOAD_L3_DATA, + FM_EREPORT_CPU_USIII_L3_WDU, + C_AFSR_L3_CPU, "L3_CPU ", ECC_C_TRAP, + CPU_UE_ECACHE, "L3_CPU", + FM_EREPORT_PAYLOAD_L3_DATA, + FM_EREPORT_CPU_USIII_L3_CPU, + C_AFSR_EDU, "EDU ", ECC_C_TRAP, + CPU_UE_ECACHE_RETIRE, "EDU:ST", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_EDUST, + C_AFSR_EDU, "EDU ", ECC_D_TRAP, + CPU_UE_ECACHE_RETIRE, "EDU:BLD", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_EDUBL, + C_AFSR_WDU, "WDU ", ECC_C_TRAP, + CPU_UE_ECACHE_RETIRE, "WDU", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_WDU, + C_AFSR_CPU, "CPU ", ECC_C_TRAP, + CPU_UE_ECACHE, "CPU", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_CPU, + C_AFSR_DUE, "DUE ", ECC_C_TRAP, + CPU_DUE, "DUE", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_DUE, + + /* CE, EDC, EMC, WDC, CPC */ + C_AFSR_CE, "CE ", ECC_C_TRAP, + CPU_CE, "Corrected system bus (CE)", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_CE, + C_AFSR_L3_EDC, "L3_EDC ", ECC_C_TRAP, + CPU_CE_ECACHE, "L3_EDC", + FM_EREPORT_PAYLOAD_L3_DATA, + FM_EREPORT_CPU_USIII_L3_EDC, + C_AFSR_EDC, "EDC ", ECC_C_TRAP, + CPU_CE_ECACHE, "EDC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_EDC, + C_AFSR_EMC, "EMC ", ECC_C_TRAP, + CPU_EMC, "EMC", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_EMC, + C_AFSR_L3_WDC, "L3_WDC ", ECC_C_TRAP, + CPU_CE_ECACHE, "L3_WDC", + FM_EREPORT_PAYLOAD_L3_DATA, + FM_EREPORT_CPU_USIII_L3_WDC, + C_AFSR_L3_CPC, "L3_CPC ", ECC_C_TRAP, + CPU_CE_ECACHE, "L3_CPC", + FM_EREPORT_PAYLOAD_L3_DATA, + FM_EREPORT_CPU_USIII_L3_CPC, + C_AFSR_L3_THCE, "L3_THCE ", ECC_C_TRAP, + CPU_CE_ECACHE, "L3_THCE", + FM_EREPORT_PAYLOAD_L3_TAG_ECC, + FM_EREPORT_CPU_USIII_L3_THCE, + C_AFSR_WDC, "WDC ", ECC_C_TRAP, + CPU_CE_ECACHE, "WDC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_WDC, + C_AFSR_CPC, "CPC ", ECC_C_TRAP, + CPU_CE_ECACHE, "CPC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_CPC, + C_AFSR_THCE, "THCE ", ECC_C_TRAP, + CPU_CE_ECACHE, "THCE", + FM_EREPORT_PAYLOAD_L2_TAG_ECC, + FM_EREPORT_CPU_USIII_THCE, + + /* TO, BERR */ + C_AFSR_TO, "TO ", ECC_ASYNC_TRAPS, + CPU_TO, "Timeout (TO)", + FM_EREPORT_PAYLOAD_IO, + FM_EREPORT_CPU_USIII_TO, + C_AFSR_BERR, "BERR ", ECC_ASYNC_TRAPS, + CPU_BERR, "Bus Error (BERR)", + FM_EREPORT_PAYLOAD_IO, + FM_EREPORT_CPU_USIII_BERR, + C_AFSR_DTO, "DTO ", ECC_C_TRAP, + CPU_TO, "Disrupting Timeout (DTO)", + FM_EREPORT_PAYLOAD_IO, + FM_EREPORT_CPU_USIII_DTO, + C_AFSR_DBERR, "DBERR ", ECC_C_TRAP, + CPU_BERR, "Disrupting Bus Error (DBERR)", + FM_EREPORT_PAYLOAD_IO, + FM_EREPORT_CPU_USIII_DBERR, + + /* IVU, IVC, IMC */ + C_AFSR_IVU, "IVU ", ECC_C_TRAP, + CPU_IV, "IVU", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_IVU, + C_AFSR_IVC, "IVC ", ECC_C_TRAP, + CPU_IV, "IVC", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_IVC, + C_AFSR_IMC, "IMC ", ECC_C_TRAP, + CPU_IV, "IMC", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_IMC, + + 0, NULL, 0, + 0, NULL, + FM_EREPORT_PAYLOAD_UNKNOWN, + FM_EREPORT_CPU_USIII_UNKNOWN, +}; + +/* + * See Cheetah+ Delta PRM 10.9 and section P.6.1 of the Panther PRM + * 
Class 4: + * AFSR -- UCC, UCU, TUE, TSCE, TUE_SH + * AFSR_EXT -- L3_UCC, L3_UCU, L3_TUE, L3_TUE_SH + * Class 3: + * AFSR -- UE, DUE, EDU, EMU, WDU, CPU + * AFSR_EXT -- L3_EDU, L3_WDU, L3_CPU + * Class 2: + * AFSR -- CE, EDC, EMC, WDC, CPC, THCE + * AFSR_EXT -- L3_EDC, L3_WDC, L3_CPC, L3_THCE + * Class 1: + * AFSR -- TO, DTO, BERR, DBERR + * AFSR_EXT -- + */ +uint64_t afar_overwrite[] = { + /* class 4: */ + C_AFSR_UCC | C_AFSR_UCU | C_AFSR_TUE | C_AFSR_TSCE | C_AFSR_TUE_SH | + C_AFSR_L3_UCC | C_AFSR_L3_UCU | C_AFSR_L3_TUE | C_AFSR_L3_TUE_SH, + /* class 3: */ + C_AFSR_UE | C_AFSR_DUE | C_AFSR_EDU | C_AFSR_EMU | C_AFSR_WDU | + C_AFSR_CPU | C_AFSR_L3_EDU | C_AFSR_L3_WDU | C_AFSR_L3_CPU, + /* class 2: */ + C_AFSR_CE | C_AFSR_EDC | C_AFSR_EMC | C_AFSR_WDC | C_AFSR_CPC | + C_AFSR_THCE | C_AFSR_L3_EDC | C_AFSR_L3_WDC | C_AFSR_L3_CPC | + C_AFSR_L3_THCE, + /* class 1: */ + C_AFSR_TO | C_AFSR_DTO | C_AFSR_BERR | C_AFSR_DBERR, + + 0 +}; + +/* + * See Cheetah+ Delta PRM 10.9. + * Class 2: UE, DUE, IVU, EDU, WDU, UCU, CPU + * Class 1: CE, IVC, EDC, WDC, UCC, CPC + */ +uint64_t esynd_overwrite[] = { + /* class 2: */ + C_AFSR_UE | C_AFSR_DUE | C_AFSR_IVU | C_AFSR_EDU | C_AFSR_WDU | + C_AFSR_UCU | C_AFSR_CPU, + /* class 1: */ + C_AFSR_CE | C_AFSR_IVC | C_AFSR_EDC | C_AFSR_WDC | C_AFSR_UCC | + C_AFSR_CPC, + 0 +}; + +/* + * In panther, the E_SYND overwrite policy changed a little bit + * by adding one more level. + * class 3: + * AFSR -- UCU, UCC + * AFSR_EXT -- L3_UCU, L3_UCC + * Class 2: + * AFSR -- UE, DUE, IVU, EDU, WDU, CPU + * AFSR_EXT -- L3_EDU, L3_WDU, L3_CPU + * Class 1: + * AFSR -- CE, IVC, EDC, WDC, CPC + * AFSR_EXT -- L3_EDC, L3_WDC, L3_CPC + */ +uint64_t pn_esynd_overwrite[] = { + /* class 3: */ + C_AFSR_UCU | C_AFSR_UCC | + C_AFSR_L3_UCU | C_AFSR_L3_UCC, + /* class 2: */ + C_AFSR_UE | C_AFSR_DUE | C_AFSR_IVU | C_AFSR_EDU | C_AFSR_WDU | + C_AFSR_CPU | + C_AFSR_L3_EDU | C_AFSR_L3_WDU | C_AFSR_L3_CPU, + /* class 1: */ + C_AFSR_CE | C_AFSR_IVC | C_AFSR_EDC | C_AFSR_WDC | C_AFSR_CPC | + C_AFSR_L3_EDC | C_AFSR_L3_WDC | C_AFSR_L3_CPC, + + 0 +}; + +int +afsr_to_pn_esynd_status(uint64_t afsr, uint64_t afsr_bit) +{ + return (afsr_to_overw_status(afsr, afsr_bit, pn_esynd_overwrite)); +} + +/* + * Prioritized list of Error bits for MSYND overwrite. + * See Cheetah PRM P.6.3 + * Class 2: EMU + * Class 1: EMC + * + * Panther adds IMU and IMC. + */ +uint64_t msynd_overwrite[] = { + /* class 2: */ + C_AFSR_EMU | C_AFSR_IMU, + /* class 1: */ + C_AFSR_EMC | C_AFSR_IMC, + + 0 +}; + +/* + * change cpu speed bits -- new speed will be normal-speed/divisor. + * + * The Jalapeno memory controllers are required to drain outstanding + * memory transactions within 32 JBus clocks in order to be ready + * to enter Estar mode. In some corner cases however, that time + * fell short. + * + * A safe software solution is to force MCU to act like in Estar mode, + * then delay 1us (in ppm code) prior to assert J_CHNG_L signal. + * To reverse the effect, upon exiting Estar, software restores the + * MCU to its original state. + */ +/* ARGSUSED1 */ +void +cpu_change_speed(uint64_t divisor, uint64_t arg2) +{ + bus_config_eclk_t *bceclk; + uint64_t reg; + + for (bceclk = bus_config_eclk; bceclk->divisor; bceclk++) { + if (bceclk->divisor != divisor) + continue; + reg = get_safari_config(); + reg &= ~SAFARI_CONFIG_ECLK_MASK; + reg |= bceclk->mask; + set_safari_config(reg); + CPU->cpu_m.divisor = (uchar_t)divisor; + return; + } + /* + * We will reach here only if OBP and kernel don't agree on + * the speeds supported by the CPU. 
+ */ + cmn_err(CE_WARN, "cpu_change_speed: bad divisor %" PRIu64, divisor); +} + +/* + * Cpu private initialization. This includes allocating the cpu_private + * data structure, initializing it, and initializing the scrubber for this + * cpu. This function calls cpu_init_ecache_scrub_dr to init the scrubber. + * We use kmem_cache_create for the cheetah private data structure because + * it needs to be allocated on a PAGESIZE (8192) byte boundary. + */ +void +cpu_init_private(struct cpu *cp) +{ + cheetah_private_t *chprp; + int i; + + ASSERT(CPU_PRIVATE(cp) == NULL); + + /* LINTED: E_TRUE_LOGICAL_EXPR */ + ASSERT((offsetof(cheetah_private_t, chpr_tl1_err_data) + + sizeof (ch_err_tl1_data_t) * CH_ERR_TL1_TLMAX) <= PAGESIZE); + + /* + * Running with Cheetah CPUs in a Cheetah+, Jaguar, Panther or + * mixed Cheetah+/Jaguar/Panther machine is not a supported + * configuration. Attempting to do so may result in unpredictable + * failures (e.g. running Cheetah+ CPUs with Cheetah E$ disp flush) + * so don't allow it. + * + * This is just defensive code since this configuration mismatch + * should have been caught prior to OS execution. + */ + if (!(IS_CHEETAH_PLUS(cpunodes[cp->cpu_id].implementation) || + IS_JAGUAR(cpunodes[cp->cpu_id].implementation) || + IS_PANTHER(cpunodes[cp->cpu_id].implementation))) { + cmn_err(CE_PANIC, "CPU%d: UltraSPARC-III not supported" + " on UltraSPARC-III+/IV/IV+ code\n", cp->cpu_id); + } + + /* + * If the ch_private_cache has not been created, create it. + */ + if (ch_private_cache == NULL) { + ch_private_cache = kmem_cache_create("ch_private_cache", + sizeof (cheetah_private_t), PAGESIZE, NULL, NULL, + NULL, NULL, static_arena, 0); + } + + chprp = CPU_PRIVATE(cp) = kmem_cache_alloc(ch_private_cache, KM_SLEEP); + + bzero(chprp, sizeof (cheetah_private_t)); + chprp->chpr_fecctl0_logout.clo_data.chd_afar = LOGOUT_INVALID; + chprp->chpr_cecc_logout.clo_data.chd_afar = LOGOUT_INVALID; + chprp->chpr_async_logout.clo_data.chd_afar = LOGOUT_INVALID; + chprp->chpr_tlb_logout.tlo_addr = LOGOUT_INVALID; + for (i = 0; i < CH_ERR_TL1_TLMAX; i++) + chprp->chpr_tl1_err_data[i].ch_err_tl1_logout.clo_data.chd_afar + = LOGOUT_INVALID; + + /* Panther has a larger Icache compared to cheetahplus or Jaguar */ + if (IS_PANTHER(cpunodes[cp->cpu_id].implementation)) { + chprp->chpr_icache_size = PN_ICACHE_SIZE; + chprp->chpr_icache_linesize = PN_ICACHE_LSIZE; + } else { + chprp->chpr_icache_size = CH_ICACHE_SIZE; + chprp->chpr_icache_linesize = CH_ICACHE_LSIZE; + } + + cpu_init_ecache_scrub_dr(cp); + + /* + * Panther's L2$ and E$ are shared between cores, so the scrubber is + * only needed on one of the cores. At this point, we assume all cores + * are online, and we only enable the scrubber on core 0. + */ + if (IS_PANTHER(cpunodes[cp->cpu_id].implementation)) { + chprp->chpr_scrub_misc.chsm_core_state = + SCRUBBER_BOTH_CORES_ONLINE; + if (cp->cpu_id != (processorid_t)cmp_cpu_to_chip(cp->cpu_id)) { + chprp->chpr_scrub_misc.chsm_enable[ + CACHE_SCRUBBER_INFO_E] = 0; + } + } + + chprp->chpr_ec_set_size = cpunodes[cp->cpu_id].ecache_size / + cpu_ecache_nway(); + + adjust_hw_copy_limits(cpunodes[cp->cpu_id].ecache_size); + ch_err_tl1_paddrs[cp->cpu_id] = va_to_pa(chprp); + ASSERT(ch_err_tl1_paddrs[cp->cpu_id] != -1); +} + +/* + * Clear the error state registers for this CPU. + * For Cheetah+/Jaguar, just clear the AFSR but + * for Panther we also have to clear the AFSR_EXT. 
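+ * The error bits in these registers are write-one-to-clear, so masking
+ * off C_AFSR_FATAL_ERRS (and C_AFSR_EXT_FATAL_ERRS on Panther) before
+ * the write leaves any fatal error bits untouched.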
+ */ +void +set_cpu_error_state(ch_cpu_errors_t *cpu_error_regs) +{ + set_asyncflt(cpu_error_regs->afsr & ~C_AFSR_FATAL_ERRS); + if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) { + set_afsr_ext(cpu_error_regs->afsr_ext & ~C_AFSR_EXT_FATAL_ERRS); + } +} + +void +pn_cpu_log_diag_l2_info(ch_async_flt_t *ch_flt) { + struct async_flt *aflt = (struct async_flt *)ch_flt; + ch_ec_data_t *l2_data = &ch_flt->flt_diag_data.chd_l2_data[0]; + uint64_t faddr = aflt->flt_addr; + uint8_t log_way_mask = 0; + int i; + + /* + * Only Panther CPUs have the additional L2$ data that needs + * to be logged here + */ + if (!IS_PANTHER(cpunodes[aflt->flt_inst].implementation)) + return; + + /* + * We'll use a simple bit mask to keep track of which way(s) + * of the stored cache line we want to log. The idea is to + * log the entry if it is a valid line and it matches our + * fault AFAR. If no match is found, we will simply log all + * the ways. + */ + for (i = 0; i < PN_L2_NWAYS; i++) + if (pn_matching_valid_l2_line(faddr, &l2_data[i])) + log_way_mask |= (1 << i); + + /* If no matching valid lines were found, we log all ways */ + if (log_way_mask == 0) + log_way_mask = (1 << PN_L2_NWAYS) - 1; + + /* Log the cache lines */ + for (i = 0; i < PN_L2_NWAYS; i++) + if (log_way_mask & (1 << i)) + l2_data[i].ec_logflag = EC_LOGFLAG_MAGIC; +} + +/* + * For this routine to return true, the L2 tag in question must be valid + * and the tag PA must match the fault address (faddr) assuming the correct + * index is being used. + */ +static int +pn_matching_valid_l2_line(uint64_t faddr, ch_ec_data_t *clo_l2_data) { + if ((!PN_L2_LINE_INVALID(clo_l2_data->ec_tag)) && + ((faddr & P2ALIGN(C_AFAR_PA, PN_L2_SET_SIZE)) == + PN_L2TAG_TO_PA(clo_l2_data->ec_tag))) + return (1); + return (0); +} + +/* + * This array is used to convert the 3 digit PgSz encoding (as used in + * various MMU registers such as MMU_TAG_ACCESS_EXT) into the corresponding + * page size. + */ +static uint64_t tlb_pgsz_to_size[] = { + /* 000 = 8KB: */ + 0x2000, + /* 001 = 64KB: */ + 0x10000, + /* 010 = 512KB: */ + 0x80000, + /* 011 = 4MB: */ + 0x400000, + /* 100 = 32MB: */ + 0x2000000, + /* 101 = 256MB: */ + 0x10000000, + /* undefined for encodings 110 and 111: */ + 0, 0 +}; + +/* + * The itlb_parity_trap and dtlb_parity_trap handlers transfer control here + * after collecting logout information related to the TLB parity error and + * flushing the offending TTE entries from the ITLB or DTLB. + * + * DTLB traps which occur at TL>0 are not recoverable because we will most + * likely be corrupting some other trap handler's alternate globals. As + * such, we simply panic here when that happens. ITLB parity errors are + * not expected to happen at TL>0. + */ +void +cpu_tlb_parity_error(struct regs *rp, ulong_t trap_va, ulong_t tlb_info) { + ch_async_flt_t ch_flt; + struct async_flt *aflt; + pn_tlb_logout_t *tlop = NULL; + int immu_parity = (tlb_info & PN_TLO_INFO_IMMU) != 0; + int tl1_trap = (tlb_info & PN_TLO_INFO_TL1) != 0; + char *error_class; + + bzero(&ch_flt, sizeof (ch_async_flt_t)); + + /* + * Get the CPU log out info. If we can't find our CPU private + * pointer, or if the logout information does not correspond to + * this error, then we will have to make due without detailed + * logout information. + */ + if (CPU_PRIVATE(CPU)) { + tlop = CPU_PRIVATE_PTR(CPU, chpr_tlb_logout); + if ((tlop->tlo_addr != trap_va) || + (tlop->tlo_info != tlb_info)) + tlop = NULL; + } + + if (tlop) { + ch_flt.tlb_diag_data = *tlop; + + /* Zero out + invalidate TLB logout. 
*/ + bzero(tlop, sizeof (pn_tlb_logout_t)); + tlop->tlo_addr = LOGOUT_INVALID; + } else { + /* + * Copy what logout information we have and mark + * it incomplete. + */ + ch_flt.flt_data_incomplete = 1; + ch_flt.tlb_diag_data.tlo_info = tlb_info; + ch_flt.tlb_diag_data.tlo_addr = trap_va; + } + + /* + * Log the error. + */ + aflt = (struct async_flt *)&ch_flt; + aflt->flt_id = gethrtime_waitfree(); + aflt->flt_bus_id = getprocessorid(); + aflt->flt_inst = CPU->cpu_id; + aflt->flt_pc = (caddr_t)rp->r_pc; + aflt->flt_addr = trap_va; + aflt->flt_prot = AFLT_PROT_NONE; + aflt->flt_class = CPU_FAULT; + aflt->flt_priv = (rp->r_tstate & TSTATE_PRIV) ? 1 : 0; + aflt->flt_tl = tl1_trap ? 1 : 0; + aflt->flt_panic = tl1_trap ? 1 : 0; + + if (immu_parity) { + aflt->flt_status = ECC_ITLB_TRAP; + ch_flt.flt_type = CPU_ITLB_PARITY; + error_class = FM_EREPORT_CPU_USIII_ITLBPE; + aflt->flt_payload = FM_EREPORT_PAYLOAD_ITLB_PE; + } else { + aflt->flt_status = ECC_DTLB_TRAP; + ch_flt.flt_type = CPU_DTLB_PARITY; + error_class = FM_EREPORT_CPU_USIII_DTLBPE; + aflt->flt_payload = FM_EREPORT_PAYLOAD_DTLB_PE; + } + + /* + * The TLB entries have already been flushed by the TL1 trap + * handler so at this point the only thing left to do is log + * the error message. + */ + if (aflt->flt_panic) { + cpu_errorq_dispatch(error_class, (void *)&ch_flt, + sizeof (ch_async_flt_t), ue_queue, aflt->flt_panic); + /* + * Panic here if aflt->flt_panic has been set. Enqueued + * errors will be logged as part of the panic flow. + */ + fm_panic("%sError(s)", immu_parity ? "ITLBPE " : "DTLBPE "); + } else { + cpu_errorq_dispatch(error_class, (void *)&ch_flt, + sizeof (ch_async_flt_t), ce_queue, aflt->flt_panic); + } +} + +/* + * This routine is called when a TLB parity error event is 'ue_drain'ed + * or 'ce_drain'ed from the errorq. + */ +void +cpu_async_log_tlb_parity_err(void *flt) { + ch_async_flt_t *ch_flt = (ch_async_flt_t *)flt; + struct async_flt *aflt = (struct async_flt *)flt; +#ifdef lint + aflt = aflt; +#endif + + /* + * We only capture TLB information if we encountered + * a TLB parity error and Panther is the only CPU which + * can detect a TLB parity error. + */ + ASSERT(IS_PANTHER(cpunodes[aflt->flt_inst].implementation)); + ASSERT((ch_flt->flt_type == CPU_ITLB_PARITY) || + (ch_flt->flt_type == CPU_DTLB_PARITY)); + + if (ch_flt->flt_data_incomplete == 0) { + if (ch_flt->flt_type == CPU_ITLB_PARITY) + ch_flt->tlb_diag_data.tlo_logflag = IT_LOGFLAG_MAGIC; + else /* parity error is in DTLB */ + ch_flt->tlb_diag_data.tlo_logflag = DT_LOGFLAG_MAGIC; + } +} + +/* + * Add L1 Prefetch cache data to the ereport payload. + */ +void +cpu_payload_add_pcache(struct async_flt *aflt, nvlist_t *nvl) +{ + ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt; + ch_pc_data_t *pcp; + ch_pc_data_t pcdata[CH_PCACHE_NWAY]; + uint_t nelem; + int i, ways_logged = 0; + + /* + * We only capture P$ information if we encountered + * a P$ parity error and Panther is the only CPU which + * can detect a P$ parity error. + */ + ASSERT(IS_PANTHER(cpunodes[aflt->flt_inst].implementation)); + for (i = 0; i < CH_PCACHE_NWAY; i++) { + pcp = &ch_flt->parity_data.dpe.cpl_pc[i]; + if (pcp->pc_logflag == PC_LOGFLAG_MAGIC) { + bcopy(pcp, &pcdata[ways_logged], + sizeof (ch_pc_data_t)); + ways_logged++; + } + } + + /* + * Add the pcache data to the payload. 
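+	 * ways_logged is reported first so consumers know how many P$ ways
+	 * follow; the captured ways themselves are exported as a flat array
+	 * of 64-bit words.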
+ */ + fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L1P_WAYS, + DATA_TYPE_UINT8, (uint8_t)ways_logged, NULL); + if (ways_logged != 0) { + nelem = sizeof (ch_pc_data_t) / sizeof (uint64_t) * ways_logged; + fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L1P_DATA, + DATA_TYPE_UINT64_ARRAY, nelem, (uint64_t *)pcdata, NULL); + } +} + +/* + * Add TLB diagnostic data to the ereport payload. + */ +void +cpu_payload_add_tlb(struct async_flt *aflt, nvlist_t *nvl) +{ + ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt; + uint8_t num_entries, tlb_data_words; + + /* + * We only capture TLB information if we encountered + * a TLB parity error and Panther is the only CPU which + * can detect a TLB parity error. + */ + ASSERT(IS_PANTHER(cpunodes[aflt->flt_inst].implementation)); + ASSERT((ch_flt->flt_type == CPU_ITLB_PARITY) || + (ch_flt->flt_type == CPU_DTLB_PARITY)); + + if (ch_flt->flt_type == CPU_ITLB_PARITY) { + num_entries = (uint8_t)(PN_ITLB_NWAYS * PN_NUM_512_ITLBS); + tlb_data_words = sizeof (ch_tte_entry_t) / sizeof (uint64_t) * + num_entries; + + /* + * Add the TLB diagnostic data to the payload + * if it was collected. + */ + if (ch_flt->tlb_diag_data.tlo_logflag == IT_LOGFLAG_MAGIC) { + fm_payload_set(nvl, + FM_EREPORT_PAYLOAD_NAME_ITLB_ENTRIES, + DATA_TYPE_UINT8, num_entries, NULL); + fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_ITLB_DATA, + DATA_TYPE_UINT64_ARRAY, tlb_data_words, + (uint64_t *)ch_flt->tlb_diag_data.tlo_itlb_tte, + NULL); + } + } else { + num_entries = (uint8_t)(PN_DTLB_NWAYS * PN_NUM_512_DTLBS); + tlb_data_words = sizeof (ch_tte_entry_t) / sizeof (uint64_t) * + num_entries; + + fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_VA, + DATA_TYPE_UINT64, ch_flt->tlb_diag_data.tlo_addr, NULL); + + /* + * Add the TLB diagnostic data to the payload + * if it was collected. + */ + if (ch_flt->tlb_diag_data.tlo_logflag == DT_LOGFLAG_MAGIC) { + fm_payload_set(nvl, + FM_EREPORT_PAYLOAD_NAME_DTLB_ENTRIES, + DATA_TYPE_UINT8, num_entries, NULL); + fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_DTLB_DATA, + DATA_TYPE_UINT64_ARRAY, tlb_data_words, + (uint64_t *)ch_flt->tlb_diag_data.tlo_dtlb_tte, + NULL); + } + } +} + +/* + * Panther Cache Scrubbing: + * + * In Jaguar, the E$ was split between cores, so the scrubber must run on both + * cores. For Panther, however, the L2$ and L3$ are shared across cores. + * Therefore, the E$ scrubber only needs to run on one of the two cores. + * + * There are four possible states for the E$ scrubber: + * + * 0. If both cores are offline, add core 0 to cpu_offline_set so that + * the offline scrubber will run on it. + * 1. If core 0 is online and core 1 off, we run the scrubber on core 0. + * 2. If core 1 is online and core 0 off, we move the scrubber to run + * on core 1. + * 3. If both cores are online, only run the scrubber on core 0. + * + * These states are enumerated by the SCRUBBER_[BOTH|CORE|NEITHER]_* defines + * above. One of those values is stored in + * chpr_scrub_misc->chsm_core_state on each core. + * + * Also note that, for Panther, ecache_flush_line() will flush out the L2$ + * before the E$, so the L2$ will be scrubbed by the E$ scrubber. No + * additional code is necessary to scrub the L2$. + * + * For all cpu types, whenever a cpu or core is offlined, add it to + * cpu_offline_set so the necessary scrubbers can still run. This is still + * necessary on Panther so the D$ scrubber can still run. 
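+ *
+ * For example: with both cores online the state is
+ * SCRUBBER_BOTH_CORES_ONLINE.  If core 0 is then taken offline,
+ * cpu_scrub_cpu_setup() computes SCRUBBER_CORE_1_ONLINE, hands the
+ * E$ flush index from core 0 over to core 1, clears core 1's
+ * outstanding count so an interrupt is rescheduled, and flips
+ * chsm_enable[CACHE_SCRUBBER_INFO_E] so that only core 1 scrubs the
+ * shared E$.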
+ */ +/*ARGSUSED*/ +int +cpu_scrub_cpu_setup(cpu_setup_t what, int cpuid, void *arg) +{ + processorid_t core_0_id; + cpu_t *core_cpus[2]; + ch_scrub_misc_t *core_scrub[2]; + int old_state, i; + int new_state = SCRUBBER_NEITHER_CORE_ONLINE; + + switch (what) { + case CPU_ON: + case CPU_INIT: + CPUSET_DEL(cpu_offline_set, cpuid); + break; + case CPU_OFF: + CPUSET_ADD(cpu_offline_set, cpuid); + break; + default: + return (0); + } + + if (!IS_PANTHER(cpunodes[cpuid].implementation)) { + return (0); + } + + /* + * Update the chsm_enable[CACHE_SCRUBBER_INFO_E] value + * if necessary + */ + core_0_id = cmp_cpu_to_chip(cpuid); + core_cpus[0] = cpu_get(core_0_id); + core_cpus[1] = cpu_get_sibling_core(core_cpus[0]); + + for (i = 0; i < 2; i++) { + if (core_cpus[i] == NULL) { + /* + * This should only happen if one of the two cores is + * blacklisted, which should only happen when we're + * doing hardware bringup or debugging. Give up and + * return quietly. + */ + return (0); + } + core_scrub[i] = CPU_PRIVATE_PTR(core_cpus[i], chpr_scrub_misc); + } + + if (cpuid == (processorid_t)cmp_cpu_to_chip(cpuid)) { + /* cpuid is core 0 */ + if (cpu_is_active(core_cpus[1])) { + new_state |= SCRUBBER_CORE_1_ONLINE; + } + if (what != CPU_OFF) { + new_state |= SCRUBBER_CORE_0_ONLINE; + } + } else { + /* cpuid is core 1 */ + if (cpu_is_active(core_cpus[0])) { + new_state |= SCRUBBER_CORE_0_ONLINE; + } + if (what != CPU_OFF) { + new_state |= SCRUBBER_CORE_1_ONLINE; + } + } + + old_state = core_scrub[0]->chsm_core_state; + + if (old_state == new_state) { + return (0); + } + + if (old_state == SCRUBBER_CORE_1_ONLINE) { + /* + * We need to move the scrubber state from core 1 + * back to core 0. This data is not protected by + * locks, but the worst that can happen is some + * lines are scrubbed multiple times. chsm_oustanding is + * set to 0 to make sure an interrupt is scheduled the + * first time through do_scrub(). + */ + core_scrub[0]->chsm_flush_index[CACHE_SCRUBBER_INFO_E] = + core_scrub[1]->chsm_flush_index[CACHE_SCRUBBER_INFO_E]; + core_scrub[0]->chsm_outstanding[CACHE_SCRUBBER_INFO_E] = 0; + } + + switch (new_state) { + case SCRUBBER_NEITHER_CORE_ONLINE: + case SCRUBBER_BOTH_CORES_ONLINE: + case SCRUBBER_CORE_0_ONLINE: + core_scrub[1]->chsm_enable[CACHE_SCRUBBER_INFO_E] = 0; + core_scrub[0]->chsm_enable[CACHE_SCRUBBER_INFO_E] = 1; + break; + + case SCRUBBER_CORE_1_ONLINE: + default: + /* + * We need to move the scrubber state from core 0 + * to core 1. + */ + core_scrub[1]->chsm_flush_index[CACHE_SCRUBBER_INFO_E] = + core_scrub[0]->chsm_flush_index[CACHE_SCRUBBER_INFO_E]; + core_scrub[1]->chsm_outstanding[CACHE_SCRUBBER_INFO_E] = 0; + + core_scrub[0]->chsm_enable[CACHE_SCRUBBER_INFO_E] = 0; + core_scrub[1]->chsm_enable[CACHE_SCRUBBER_INFO_E] = 1; + break; + } + + core_scrub[0]->chsm_core_state = new_state; + core_scrub[1]->chsm_core_state = new_state; + return (0); +} + +/* + * Returns a pointer to the cpu structure of the argument's sibling core. + * If no sibling core can be found, return NULL. 
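+ *
+ * Typical use, as in cpu_scrub_cpu_setup() above (variable names here
+ * are illustrative):
+ *
+ *	sib = cpu_get_sibling_core(cpu_get(cmp_cpu_to_chip(cpuid)));
+ *	if (sib != NULL)
+ *		sib_scrub = CPU_PRIVATE_PTR(sib, chpr_scrub_misc);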
+ */ +static cpu_t * +cpu_get_sibling_core(cpu_t *cpup) +{ + cpu_t *nextp; + + if (!cmp_cpu_is_cmp(cpup->cpu_id)) + return (NULL); + + nextp = cpup->cpu_next_chip; + if ((nextp == NULL) || (nextp == cpup)) + return (NULL); + + return (nextp); +} diff --git a/usr/src/uts/sun4u/cpu/us3_cheetahplus_asm.s b/usr/src/uts/sun4u/cpu/us3_cheetahplus_asm.s new file mode 100644 index 0000000000..2dd4852312 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/us3_cheetahplus_asm.s @@ -0,0 +1,989 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Assembly code support for the Cheetah+ module + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#if !defined(lint) +#include "assym.h" +#endif /* lint */ + +#include <sys/asm_linkage.h> +#include <sys/mmu.h> +#include <vm/hat_sfmmu.h> +#include <sys/machparam.h> +#include <sys/machcpuvar.h> +#include <sys/machthread.h> +#include <sys/machtrap.h> +#include <sys/privregs.h> +#include <sys/asm_linkage.h> +#include <sys/trap.h> +#include <sys/cheetahregs.h> +#include <sys/us3_module.h> +#include <sys/xc_impl.h> +#include <sys/intreg.h> +#include <sys/async.h> +#include <sys/clock.h> +#include <sys/cheetahasm.h> + +#ifdef TRAPTRACE +#include <sys/traptrace.h> +#endif /* TRAPTRACE */ + +#if !defined(lint) + +/* BEGIN CSTYLED */ + +/* + * Cheetah+ version to reflush an Ecache line by index. + * + * By default we assume the Ecache is 2-way so we flush both + * ways. Even if the cache is direct-mapped no harm will come + * from performing the flush twice, apart from perhaps a performance + * penalty. + * + * XXX - scr2 not used. + */ +#define ECACHE_REFLUSH_LINE(ec_set_size, index, scr2) \ + ldxa [index]ASI_EC_DIAG, %g0; \ + ldxa [index + ec_set_size]ASI_EC_DIAG, %g0; + +/* + * Cheetah+ version of ecache_flush_line. Uses Cheetah+ Ecache Displacement + * Flush feature. + */ +#define ECACHE_FLUSH_LINE(physaddr, ec_set_size, scr1, scr2) \ + sub ec_set_size, 1, scr1; \ + and physaddr, scr1, scr1; \ + set CHP_ECACHE_IDX_DISP_FLUSH, scr2; \ + or scr2, scr1, scr1; \ + ECACHE_REFLUSH_LINE(ec_set_size, scr1, scr2) + +/* END CSTYLED */ + +/* + * Panther version to reflush a line from both the L2 cache and L3 + * cache by the respective indexes. Flushes all ways of the line from + * each cache. + * + * l2_index Index into the L2$ of the line to be flushed. This + * register will not be modified by this routine. + * l3_index Index into the L3$ of the line to be flushed. This + * register will not be modified by this routine. + * scr2 scratch register. + * scr3 scratch register. 
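+ *
+ * Roughly equivalent C, for illustration only (ldxa() here stands in
+ * for the diagnostic ldxa loads through the named ASIs):
+ *
+ *	for (off = PN_L2_MAX_SET; off >= 0; off -= PN_L2_SET_SIZE)
+ *		(void) ldxa(ASI_L2_TAG, l2_index + off);
+ *	for (off = PN_L3_MAX_SET; off >= 0; off -= PN_L3_SET_SIZE)
+ *		(void) ldxa(ASI_EC_DIAG, l3_index + off);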
+ * + */ +#define PN_ECACHE_REFLUSH_LINE(l2_index, l3_index, scr2, scr3) \ + set PN_L2_MAX_SET, scr2; \ + set PN_L2_SET_SIZE, scr3; \ +1: \ + ldxa [l2_index + scr2]ASI_L2_TAG, %g0; \ + cmp scr2, %g0; \ + bg,a 1b; \ + sub scr2, scr3, scr2; \ + set PN_L3_MAX_SET, scr2; \ + set PN_L3_SET_SIZE, scr3; \ +2: \ + ldxa [l3_index + scr2]ASI_EC_DIAG, %g0; \ + cmp scr2, %g0; \ + bg,a 2b; \ + sub scr2, scr3, scr2; + + +/* + * Panther version of ecache_flush_line. Flushes the line corresponding + * to physaddr from both the L2 cache and the L3 cache. + * + * physaddr Input: Physical address to flush. + * Output: Physical address to flush (preserved). + * l2_idx_out Input: scratch register. + * Output: Index into the L2$ of the line to be flushed. + * l3_idx_out Input: scratch register. + * Output: Index into the L3$ of the line to be flushed. + * scr3 scratch register. + * scr4 scratch register. + * + */ +#define PN_ECACHE_FLUSH_LINE(physaddr, l2_idx_out, l3_idx_out, scr3, scr4) \ + set PN_L3_SET_SIZE, l2_idx_out; \ + sub l2_idx_out, 1, l2_idx_out; \ + and physaddr, l2_idx_out, l3_idx_out; \ + set PN_L3_IDX_DISP_FLUSH, l2_idx_out; \ + or l2_idx_out, l3_idx_out, l3_idx_out; \ + set PN_L2_SET_SIZE, l2_idx_out; \ + sub l2_idx_out, 1, l2_idx_out; \ + and physaddr, l2_idx_out, l2_idx_out; \ + set PN_L2_IDX_DISP_FLUSH, scr3; \ + or l2_idx_out, scr3, l2_idx_out; \ + PN_ECACHE_REFLUSH_LINE(l2_idx_out, l3_idx_out, scr3, scr4) + +#endif /* !lint */ + +/* + * Fast ECC error at TL>0 handler + * We get here via trap 70 at TL>0->Software trap 0 at TL>0. We enter + * this routine with %g1 and %g2 already saved in %tpc, %tnpc and %tstate. + * For a complete description of the Fast ECC at TL>0 handling see the + * comment block "Cheetah/Cheetah+ Fast ECC at TL>0 trap strategy" in + * us3_common_asm.s + */ +#if defined(lint) + +void +fast_ecc_tl1_err(void) +{} + +#else /* lint */ + + .section ".text" + .align 64 + ENTRY_NP(fast_ecc_tl1_err) + + /* + * This macro turns off the D$/I$ if they are on and saves their + * original state in ch_err_tl1_tmp, saves all the %g registers in the + * ch_err_tl1_data structure, updates the ch_err_tl1_flags and saves + * the %tpc in ch_err_tl1_tpc. At the end of this macro, %g1 will + * point to the ch_err_tl1_data structure and the original D$/I$ state + * will be saved in ch_err_tl1_tmp. All %g registers except for %g1 + * will be available. + */ + CH_ERR_TL1_FECC_ENTER; + + /* + * Get the diagnostic logout data. %g4 must be initialized to + * current CEEN state, %g5 must point to logout structure in + * ch_err_tl1_data_t. %g3 will contain the nesting count upon + * return. + */ + ldxa [%g0]ASI_ESTATE_ERR, %g4 + and %g4, EN_REG_CEEN, %g4 + add %g1, CH_ERR_TL1_LOGOUT, %g5 + DO_TL1_CPU_LOGOUT(%g3, %g2, %g4, %g5, %g6, %g3, %g4) + + /* + * If the logout nesting count is exceeded, we're probably + * not making any progress, try to panic instead. + */ + cmp %g3, CLO_NESTING_MAX + bge fecc_tl1_err + nop + + /* + * Save the current CEEN and NCEEN state in %g7 and turn them off + * before flushing the Ecache. + */ + ldxa [%g0]ASI_ESTATE_ERR, %g7 + andn %g7, EN_REG_CEEN | EN_REG_NCEEN, %g5 + stxa %g5, [%g0]ASI_ESTATE_ERR + membar #Sync + + /* + * Flush the Ecache, using the largest possible cache size with the + * smallest possible line size since we can't get the actual sizes + * from the cpu_node due to DTLB misses. 
+ */ + PN_L2_FLUSHALL(%g3, %g4, %g5) + + set CH_ECACHE_MAX_SIZE, %g4 + set CH_ECACHE_MIN_LSIZE, %g5 + + GET_CPU_IMPL(%g6) + cmp %g6, PANTHER_IMPL + bne %xcc, 2f + nop + set PN_L3_SIZE, %g4 +2: + mov %g6, %g3 + CHP_ECACHE_FLUSHALL(%g4, %g5, %g3) + + /* + * Restore CEEN and NCEEN to the previous state. + */ + stxa %g7, [%g0]ASI_ESTATE_ERR + membar #Sync + + /* + * If we turned off the D$, then flush it and turn it back on. + */ + ldxa [%g1 + CH_ERR_TL1_TMP]%asi, %g3 + andcc %g3, CH_ERR_TSTATE_DC_ON, %g0 + bz %xcc, 3f + nop + + /* + * Flush the D$. + */ + ASM_LD(%g4, dcache_size) + ASM_LD(%g5, dcache_linesize) + CH_DCACHE_FLUSHALL(%g4, %g5, %g6) + + /* + * Turn the D$ back on. + */ + ldxa [%g0]ASI_DCU, %g3 + or %g3, DCU_DC, %g3 + stxa %g3, [%g0]ASI_DCU + membar #Sync +3: + /* + * If we turned off the I$, then flush it and turn it back on. + */ + ldxa [%g1 + CH_ERR_TL1_TMP]%asi, %g3 + andcc %g3, CH_ERR_TSTATE_IC_ON, %g0 + bz %xcc, 4f + nop + + /* + * Flush the I$. Panther has different I$ parameters, and we + * can't access the logout I$ params without possibly generating + * a MMU miss. + */ + GET_CPU_IMPL(%g6) + set PN_ICACHE_SIZE, %g3 + set CH_ICACHE_SIZE, %g4 + mov CH_ICACHE_LSIZE, %g5 + cmp %g6, PANTHER_IMPL + movz %xcc, %g3, %g4 + movz %xcc, PN_ICACHE_LSIZE, %g5 + CH_ICACHE_FLUSHALL(%g4, %g5, %g6, %g3) + + /* + * Turn the I$ back on. Changing DCU_IC requires flush. + */ + ldxa [%g0]ASI_DCU, %g3 + or %g3, DCU_IC, %g3 + stxa %g3, [%g0]ASI_DCU + flush %g0 +4: + +#ifdef TRAPTRACE + /* + * Get current trap trace entry physical pointer. + */ + CPU_INDEX(%g6, %g5) + sll %g6, TRAPTR_SIZE_SHIFT, %g6 + set trap_trace_ctl, %g5 + add %g6, %g5, %g6 + ld [%g6 + TRAPTR_LIMIT], %g5 + tst %g5 + be %icc, skip_traptrace + nop + ldx [%g6 + TRAPTR_PBASE], %g5 + ld [%g6 + TRAPTR_OFFSET], %g4 + add %g5, %g4, %g5 + + /* + * Create trap trace entry. + */ + rd %asi, %g7 + wr %g0, TRAPTR_ASI, %asi + rd STICK, %g4 + stxa %g4, [%g5 + TRAP_ENT_TICK]%asi + rdpr %tl, %g4 + stha %g4, [%g5 + TRAP_ENT_TL]%asi + rdpr %tt, %g4 + stha %g4, [%g5 + TRAP_ENT_TT]%asi + rdpr %tpc, %g4 + stna %g4, [%g5 + TRAP_ENT_TPC]%asi + rdpr %tstate, %g4 + stxa %g4, [%g5 + TRAP_ENT_TSTATE]%asi + stna %sp, [%g5 + TRAP_ENT_SP]%asi + stna %g0, [%g5 + TRAP_ENT_TR]%asi + wr %g0, %g7, %asi + ldxa [%g1 + CH_ERR_TL1_SDW_AFAR]%asi, %g3 + ldxa [%g1 + CH_ERR_TL1_SDW_AFSR]%asi, %g4 + wr %g0, TRAPTR_ASI, %asi + stna %g3, [%g5 + TRAP_ENT_F1]%asi + stna %g4, [%g5 + TRAP_ENT_F2]%asi + wr %g0, %g7, %asi + ldxa [%g1 + CH_ERR_TL1_AFAR]%asi, %g3 + ldxa [%g1 + CH_ERR_TL1_AFSR]%asi, %g4 + wr %g0, TRAPTR_ASI, %asi + stna %g3, [%g5 + TRAP_ENT_F3]%asi + stna %g4, [%g5 + TRAP_ENT_F4]%asi + wr %g0, %g7, %asi + + /* + * Advance trap trace pointer. + */ + ld [%g6 + TRAPTR_OFFSET], %g5 + ld [%g6 + TRAPTR_LIMIT], %g4 + st %g5, [%g6 + TRAPTR_LAST_OFFSET] + add %g5, TRAP_ENT_SIZE, %g5 + sub %g4, TRAP_ENT_SIZE, %g4 + cmp %g5, %g4 + movge %icc, 0, %g5 + st %g5, [%g6 + TRAPTR_OFFSET] +skip_traptrace: +#endif /* TRAPTRACE */ + + /* + * If nesting count is not zero, skip all the AFSR/AFAR + * handling and just do the necessary cache-flushing. + */ + ldxa [%g1 + CH_ERR_TL1_NEST_CNT]%asi, %g2 + brnz %g2, 6f + nop + + /* + * If a UCU or L3_UCU followed by a WDU has occurred go ahead + * and panic since a UE will occur (on the retry) before the + * UCU and WDU messages are enqueued. + */ + ldxa [%g1 + CH_ERR_TL1_SDW_AFSR]%asi, %g3 + set 1, %g4 + sllx %g4, C_AFSR_UCU_SHIFT, %g4 + btst %g4, %g3 ! UCU in original shadow AFSR? 
+ bnz %xcc, 5f + mov 1, %g4 + ldxa [%g1 + CH_ERR_TL1_SDW_AFSR_EXT]%asi, %g3 + sllx %g4, C_AFSR_L3_UCU_SHIFT, %g4 + btst %g4, %g3 ! L3_UCU in original shadow AFSR_EXT? + bz %xcc, 6f + nop +5: + ldxa [%g1 + CH_ERR_TL1_AFSR]%asi, %g4 ! original AFSR + ldxa [%g0]ASI_AFSR, %g3 ! current AFSR + or %g3, %g4, %g3 ! %g3 = original + current AFSR + set 1, %g4 + sllx %g4, C_AFSR_WDU_SHIFT, %g4 + btst %g4, %g3 ! WDU in original or current AFSR? + bnz %xcc, fecc_tl1_err + nop + +6: + /* + * We fall into this macro if we've successfully logged the error in + * the ch_err_tl1_data structure and want the PIL15 softint to pick + * it up and log it. %g1 must point to the ch_err_tl1_data structure. + * Restores the %g registers and issues retry. + */ + CH_ERR_TL1_EXIT; + + /* + * Establish panic exit label. + */ + CH_ERR_TL1_PANIC_EXIT(fecc_tl1_err); + + SET_SIZE(fast_ecc_tl1_err) + +#endif /* lint */ + + +#if defined(lint) +/* + * scrubphys - Pass in the aligned physical memory address + * that you want to scrub, along with the ecache set size. + * + * 1) Displacement flush the E$ line corresponding to %addr. + * The first ldxa guarantees that the %addr is no longer in + * M, O, or E (goes to I or S (if instruction fetch also happens). + * 2) "Write" the data using a CAS %addr,%g0,%g0. + * The casxa guarantees a transition from I to M or S to M. + * 3) Displacement flush the E$ line corresponding to %addr. + * The second ldxa pushes the M line out of the ecache, into the + * writeback buffers, on the way to memory. + * 4) The "membar #Sync" pushes the cache line out of the writeback + * buffers onto the bus, on the way to dram finally. + * + * This is a modified version of the algorithm suggested by Gary Lauterbach. + * In theory the CAS %addr,%g0,%g0 is supposed to mark the addr's cache line + * as modified, but then we found out that for spitfire, if it misses in the + * E$ it will probably install as an M, but if it hits in the E$, then it + * will stay E, if the store doesn't happen. So the first displacement flush + * should ensure that the CAS will miss in the E$. Arrgh. + */ +/* ARGSUSED */ +void +scrubphys(uint64_t paddr, int ecache_set_size) +{} + +#else /* lint */ + ENTRY(scrubphys) + rdpr %pstate, %o4 + andn %o4, PSTATE_IE | PSTATE_AM, %o5 + wrpr %o5, %g0, %pstate ! clear IE, AM bits + + GET_CPU_IMPL(%o5) ! Panther Ecache is flushed differently + cmp %o5, PANTHER_IMPL + bne scrubphys_1 + nop + PN_ECACHE_FLUSH_LINE(%o0, %o1, %o2, %o3, %o5) + casxa [%o0]ASI_MEM, %g0, %g0 + PN_ECACHE_REFLUSH_LINE(%o1, %o2, %o3, %o0) + b scrubphys_2 + nop +scrubphys_1: + ECACHE_FLUSH_LINE(%o0, %o1, %o2, %o3) + casxa [%o0]ASI_MEM, %g0, %g0 + ECACHE_REFLUSH_LINE(%o1, %o2, %o3) +scrubphys_2: + wrpr %g0, %o4, %pstate ! restore earlier pstate register value + + retl + membar #Sync ! move the data out of the load buffer + SET_SIZE(scrubphys) + +#endif /* lint */ + + +#if defined(lint) +/* + * clearphys - Pass in the aligned physical memory address + * that you want to push out, as a ecache_linesize byte block of zeros, + * from the ecache zero-filled. 
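+ *
+ * In outline (see the body below): NCEEN is turned off, the
+ * ecache_linesize bytes at paddr are zeroed with stores through
+ * ASI_MEM, the corresponding line is displacement flushed (both L2$
+ * and L3$ on Panther), the AFSR is cleared, and NCEEN is restored.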
+ */ +/* ARGSUSED */ +void +clearphys(uint64_t paddr, int ecache_set_size, int ecache_linesize) +{ +} + +#else /* lint */ + ENTRY(clearphys) + /* turn off IE, AM bits */ + rdpr %pstate, %o4 + andn %o4, PSTATE_IE | PSTATE_AM, %o5 + wrpr %o5, %g0, %pstate + + /* turn off NCEEN */ + ldxa [%g0]ASI_ESTATE_ERR, %o5 + andn %o5, EN_REG_NCEEN, %o3 + stxa %o3, [%g0]ASI_ESTATE_ERR + membar #Sync + + /* zero the E$ line */ +clearphys_1: + subcc %o2, 8, %o2 + bge clearphys_1 + stxa %g0, [%o0 + %o2]ASI_MEM + + GET_CPU_IMPL(%o3) ! Panther Ecache is flushed differently + cmp %o3, PANTHER_IMPL + bne clearphys_2 + nop + PN_ECACHE_FLUSH_LINE(%o0, %o1, %o2, %o3, %g1) + casxa [%o0]ASI_MEM, %g0, %g0 + PN_ECACHE_REFLUSH_LINE(%o1, %o2, %o3, %o0) + b clearphys_3 + nop +clearphys_2: + ECACHE_FLUSH_LINE(%o0, %o1, %o2, %o3) + casxa [%o0]ASI_MEM, %g0, %g0 + ECACHE_REFLUSH_LINE(%o1, %o2, %o3) +clearphys_3: + /* clear the AFSR */ + ldxa [%g0]ASI_AFSR, %o1 + stxa %o1, [%g0]ASI_AFSR + membar #Sync + + /* turn NCEEN back on */ + stxa %o5, [%g0]ASI_ESTATE_ERR + membar #Sync + + /* return and re-enable IE and AM */ + retl + wrpr %g0, %o4, %pstate + SET_SIZE(clearphys) + +#endif /* lint */ + + +#if defined(lint) +/* + * Cheetah+ Ecache displacement flush the specified line from the E$ + * + * For Panther, this means flushing the specified line from both the + * L2 cache and L3 cache. + * + * Register usage: + * %o0 - 64 bit physical address for flushing + * %o1 - Ecache set size + */ +/*ARGSUSED*/ +void +ecache_flush_line(uint64_t flushaddr, int ec_set_size) +{ +} +#else /* lint */ + ENTRY(ecache_flush_line) + + GET_CPU_IMPL(%o3) ! Panther Ecache is flushed differently + cmp %o3, PANTHER_IMPL + bne ecache_flush_line_1 + nop + + PN_ECACHE_FLUSH_LINE(%o0, %o1, %o2, %o3, %o4) + b ecache_flush_line_2 + nop +ecache_flush_line_1: + ECACHE_FLUSH_LINE(%o0, %o1, %o2, %o3) +ecache_flush_line_2: + retl + nop + SET_SIZE(ecache_flush_line) +#endif /* lint */ + +#if defined(lint) +void +set_afsr_ext(uint64_t afsr_ext) +{ + afsr_ext = afsr_ext; +} +#else /* lint */ + + ENTRY(set_afsr_ext) + set ASI_AFSR_EXT_VA, %o1 + stxa %o0, [%o1]ASI_AFSR ! afsr_ext reg + membar #Sync + retl + nop + SET_SIZE(set_afsr_ext) + +#endif /* lint */ + + +#if defined(lint) +/* + * The CPU jumps here from the MMU exception handler if an ITLB parity + * error is detected and we are running on Panther. + * + * In this routine we collect diagnostic information and write it to our + * logout structure (if possible) and clear all ITLB entries that may have + * caused our parity trap. + * Then we call cpu_tlb_parity_error via systrap in order to drop down to TL0 + * and log any error messages. As for parameters to cpu_tlb_parity_error, we + * send two: + * + * %g2 - Contains the VA whose lookup in the ITLB caused the parity error + * %g3 - Contains the tlo_info field of the pn_tlb_logout logout struct, + * regardless of whether or not we actually used the logout struct. + * + * In the TL0 handler (cpu_tlb_parity_error) we will compare those two + * parameters to the data contained in the logout structure in order to + * determine whether the logout information is valid for this particular + * error or not. + */ +void +itlb_parity_trap(void) +{} + +#else /* lint */ + + ENTRY_NP(itlb_parity_trap) + /* + * Collect important information about the trap which will be + * used as a parameter to the TL0 handler. + */ + wr %g0, ASI_IMMU, %asi + rdpr %tpc, %g2 ! VA that caused the IMMU trap + ldxa [MMU_TAG_ACCESS_EXT]%asi, %g3 ! 
read the trap VA page size + set PN_ITLB_PGSZ_MASK, %g4 + and %g3, %g4, %g3 + ldxa [MMU_TAG_ACCESS]%asi, %g4 + set TAGREAD_CTX_MASK, %g5 + and %g4, %g5, %g4 + or %g4, %g3, %g3 ! 'or' in the trap context and + mov 1, %g4 ! add the IMMU flag to complete + sllx %g4, PN_TLO_INFO_IMMU_SHIFT, %g4 + or %g4, %g3, %g3 ! the tlo_info field for logout + stxa %g0,[MMU_SFSR]%asi ! clear the SFSR + membar #Sync + + /* + * at this point: + * %g2 - contains the VA whose lookup caused the trap + * %g3 - contains the tlo_info field + * + * Next, we calculate the TLB index value for the failing VA. + */ + mov %g2, %g4 ! We need the ITLB index + set PN_ITLB_PGSZ_MASK, %g5 + and %g3, %g5, %g5 + srlx %g5, PN_ITLB_PGSZ_SHIFT, %g5 + PN_GET_TLB_INDEX(%g4, %g5) ! %g4 has the index + sllx %g4, PN_TLB_ACC_IDX_SHIFT, %g4 ! shift the index into place + set PN_ITLB_T512, %g5 + or %g4, %g5, %g4 ! and add in the TLB ID + + /* + * at this point: + * %g2 - contains the VA whose lookup caused the trap + * %g3 - contains the tlo_info field + * %g4 - contains the TLB access index value for the + * VA/PgSz in question + * + * Check to see if the logout structure is available. + */ + set CHPR_TLB_LOGOUT, %g6 + GET_CPU_PRIVATE_PTR(%g6, %g1, %g5, itlb_parity_trap_1) + set LOGOUT_INVALID_U32, %g6 + sllx %g6, 32, %g6 ! if our logout structure is + set LOGOUT_INVALID_L32, %g5 ! unavailable or if it is + or %g5, %g6, %g5 ! already being used, then we + ldx [%g1 + PN_TLO_ADDR], %g6 ! don't collect any diagnostic + cmp %g6, %g5 ! information before clearing + bne itlb_parity_trap_1 ! and logging the error. + nop + + /* + * Record the logout information. %g4 contains our index + TLB ID + * for use in ASI_ITLB_ACCESS and ASI_ITLB_TAGREAD. %g1 contains + * the pointer to our logout struct. + */ + stx %g3, [%g1 + PN_TLO_INFO] + stx %g2, [%g1 + PN_TLO_ADDR] + stx %g2, [%g1 + PN_TLO_PC] ! %tpc == fault addr for IMMU + + add %g1, PN_TLO_ITLB_TTE, %g1 ! move up the pointer + + ldxa [%g4]ASI_ITLB_ACCESS, %g5 ! read the data + stx %g5, [%g1 + CH_TLO_TTE_DATA] ! store it away + ldxa [%g4]ASI_ITLB_TAGREAD, %g5 ! read the tag + stx %g5, [%g1 + CH_TLO_TTE_TAG] ! store it away + + set PN_TLB_ACC_WAY_BIT, %g6 ! same thing again for way 1 + or %g4, %g6, %g4 + add %g1, CH_TLO_TTE_SIZE, %g1 ! move up the pointer + + ldxa [%g4]ASI_ITLB_ACCESS, %g5 ! read the data + stx %g5, [%g1 + CH_TLO_TTE_DATA] ! store it away + ldxa [%g4]ASI_ITLB_TAGREAD, %g5 ! read the tag + stx %g5, [%g1 + CH_TLO_TTE_TAG] ! store it away + + andn %g4, %g6, %g4 ! back to way 0 + +itlb_parity_trap_1: + /* + * at this point: + * %g2 - contains the VA whose lookup caused the trap + * %g3 - contains the tlo_info field + * %g4 - contains the TLB access index value for the + * VA/PgSz in question + * + * Here we will clear the errors from the TLB. + */ + set MMU_TAG_ACCESS, %g5 ! We write a TTE tag value of + stxa %g0, [%g5]ASI_IMMU ! 0 as it will be invalid. + stxa %g0, [%g4]ASI_ITLB_ACCESS ! Write the data and tag + membar #Sync + + set PN_TLB_ACC_WAY_BIT, %g6 ! same thing again for way 1 + or %g4, %g6, %g4 + + stxa %g0, [%g4]ASI_ITLB_ACCESS ! Write same data and tag + membar #Sync + + sethi %hi(FLUSH_ADDR), %g6 ! PRM says we need to issue a + flush %g6 ! flush after writing MMU regs + + /* + * at this point: + * %g2 - contains the VA whose lookup caused the trap + * %g3 - contains the tlo_info field + * + * Call cpu_tlb_parity_error via systrap at PIL 14 unless we're + * already at PIL 15. 
*/ + set cpu_tlb_parity_error, %g1 + rdpr %pil, %g4 + cmp %g4, PIL_14 + movl %icc, PIL_14, %g4 + ba sys_trap + nop + SET_SIZE(itlb_parity_trap) + +#endif /* lint */ + +#if defined(lint) +/* + * The CPU jumps here from the MMU exception handler if a DTLB parity + * error is detected and we are running on Panther. + * + * In this routine we collect diagnostic information and write it to our + * logout structure (if possible) and clear all DTLB entries that may have + * caused our parity trap. + * Then we call cpu_tlb_parity_error via systrap in order to drop down to TL0 + * and log any error messages. As for parameters to cpu_tlb_parity_error, we + * send two: + * + * %g2 - Contains the VA whose lookup in the DTLB caused the parity error + * %g3 - Contains the tlo_info field of the pn_tlb_logout logout struct, + * regardless of whether or not we actually used the logout struct. + * + * In the TL0 handler (cpu_tlb_parity_error) we will compare those two + * parameters to the data contained in the logout structure in order to + * determine whether the logout information is valid for this particular + * error or not. + */ +void +dtlb_parity_trap(void) +{} + +#else /* lint */ + + ENTRY_NP(dtlb_parity_trap) + /* + * Collect important information about the trap which will be + * used as a parameter to the TL0 handler. + */ + wr %g0, ASI_DMMU, %asi + ldxa [MMU_SFAR]%asi, %g2 ! VA that caused the IMMU trap + ldxa [MMU_TAG_ACCESS_EXT]%asi, %g3 ! read the trap VA page sizes + set PN_DTLB_PGSZ_MASK, %g4 + and %g3, %g4, %g3 + ldxa [MMU_TAG_ACCESS]%asi, %g4 + set TAGREAD_CTX_MASK, %g5 ! 'or' in the trap context + and %g4, %g5, %g4 ! to complete the tlo_info + or %g4, %g3, %g3 ! field for logout + stxa %g0,[MMU_SFSR]%asi ! clear the SFSR + membar #Sync + + /* + * at this point: + * %g2 - contains the VA whose lookup caused the trap + * %g3 - contains the tlo_info field + * + * Calculate the TLB index values for the failing VA. Since the T512 + * TLBs can be configured for different page sizes, we need to find + * the index into each one separately. + */ + mov %g2, %g4 ! First we get the DTLB_0 index + set PN_DTLB_PGSZ0_MASK, %g5 + and %g3, %g5, %g5 + srlx %g5, PN_DTLB_PGSZ0_SHIFT, %g5 + PN_GET_TLB_INDEX(%g4, %g5) ! %g4 has the DTLB_0 index + sllx %g4, PN_TLB_ACC_IDX_SHIFT, %g4 ! shift the index into place + set PN_DTLB_T512_0, %g5 + or %g4, %g5, %g4 ! and add in the TLB ID + + mov %g2, %g7 ! Next we get the DTLB_1 index + set PN_DTLB_PGSZ1_MASK, %g5 + and %g3, %g5, %g5 + srlx %g5, PN_DTLB_PGSZ1_SHIFT, %g5 + PN_GET_TLB_INDEX(%g7, %g5) ! %g7 has the DTLB_1 index + sllx %g7, PN_TLB_ACC_IDX_SHIFT, %g7 ! shift the index into place + set PN_DTLB_T512_1, %g5 + or %g7, %g5, %g7 ! and add in the TLB ID + + /* + * at this point: + * %g2 - contains the VA whose lookup caused the trap + * %g3 - contains the tlo_info field + * %g4 - contains the T512_0 access index value for the + * VA/PgSz in question + * %g7 - contains the T512_1 access index value for the + * VA/PgSz in question + * + * If this trap happened at TL>0, then we don't want to mess + * with the normal logout struct since that could caused a TLB + * miss. + */ + rdpr %tl, %g6 ! read current trap level + cmp %g6, 1 ! skip over the tl>1 code + ble dtlb_parity_trap_1 ! if TL <= 1. + nop + + /* + * If we are here, then the trap happened at TL>1. Simply + * update our tlo_info field and then skip to the TLB flush + * code. 
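+	 * The TL>1 case is recorded by OR-ing a flag into tlo_info via
+	 * PN_TLO_INFO_TL1_SHIFT, so the TL0 handler can tell that this
+	 * event came in at TL>1.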
+ */ + mov 1, %g6 + sllx %g6, PN_TLO_INFO_TL1_SHIFT, %g6 + or %g6, %g3, %g3 + ba dtlb_parity_trap_2 + nop + +dtlb_parity_trap_1: + /* + * at this point: + * %g2 - contains the VA whose lookup caused the trap + * %g3 - contains the tlo_info field + * %g4 - contains the T512_0 access index value for the + * VA/PgSz in question + * %g7 - contains the T512_1 access index value for the + * VA/PgSz in question + * + * Check to see if the logout structure is available. + */ + set CHPR_TLB_LOGOUT, %g6 + GET_CPU_PRIVATE_PTR(%g6, %g1, %g5, dtlb_parity_trap_2) + set LOGOUT_INVALID_U32, %g6 + sllx %g6, 32, %g6 ! if our logout structure is + set LOGOUT_INVALID_L32, %g5 ! unavailable or if it is + or %g5, %g6, %g5 ! already being used, then we + ldx [%g1 + PN_TLO_ADDR], %g6 ! don't collect any diagnostic + cmp %g6, %g5 ! information before clearing + bne dtlb_parity_trap_2 ! and logging the error. + nop + + /* + * Record the logout information. %g4 contains our DTLB_0 + * index + TLB ID and %g7 contains our DTLB_1 index + TLB ID + * both of which will be used for ASI_DTLB_ACCESS and + * ASI_DTLB_TAGREAD. %g1 contains the pointer to our logout + * struct. + */ + stx %g3, [%g1 + PN_TLO_INFO] + stx %g2, [%g1 + PN_TLO_ADDR] + rdpr %tpc, %g5 + stx %g5, [%g1 + PN_TLO_PC] + + add %g1, PN_TLO_DTLB_TTE, %g1 ! move up the pointer + + ldxa [%g4]ASI_DTLB_ACCESS, %g5 ! read the data from DTLB_0 + stx %g5, [%g1 + CH_TLO_TTE_DATA] ! way 0 and store it away + ldxa [%g4]ASI_DTLB_TAGREAD, %g5 ! read the tag from DTLB_0 + stx %g5, [%g1 + CH_TLO_TTE_TAG] ! way 0 and store it away + + ldxa [%g7]ASI_DTLB_ACCESS, %g5 ! now repeat for DTLB_1 way 0 + stx %g5, [%g1 + (CH_TLO_TTE_DATA + (CH_TLO_TTE_SIZE * 2))] + ldxa [%g7]ASI_DTLB_TAGREAD, %g5 + stx %g5, [%g1 + (CH_TLO_TTE_TAG + (CH_TLO_TTE_SIZE * 2))] + + set PN_TLB_ACC_WAY_BIT, %g6 ! same thing again for way 1 + or %g4, %g6, %g4 ! of each TLB. + or %g7, %g6, %g7 + add %g1, CH_TLO_TTE_SIZE, %g1 ! move up the pointer + + ldxa [%g4]ASI_DTLB_ACCESS, %g5 ! read the data from DTLB_0 + stx %g5, [%g1 + CH_TLO_TTE_DATA] ! way 1 and store it away + ldxa [%g4]ASI_DTLB_TAGREAD, %g5 ! read the tag from DTLB_0 + stx %g5, [%g1 + CH_TLO_TTE_TAG] ! way 1 and store it away + + ldxa [%g7]ASI_DTLB_ACCESS, %g5 ! now repeat for DTLB_1 way 1 + stx %g5, [%g1 + (CH_TLO_TTE_DATA + (CH_TLO_TTE_SIZE * 2))] + ldxa [%g7]ASI_DTLB_TAGREAD, %g5 + stx %g5, [%g1 + (CH_TLO_TTE_TAG + (CH_TLO_TTE_SIZE * 2))] + + andn %g4, %g6, %g4 ! back to way 0 + andn %g7, %g6, %g7 ! back to way 0 + +dtlb_parity_trap_2: + /* + * at this point: + * %g2 - contains the VA whose lookup caused the trap + * %g3 - contains the tlo_info field + * %g4 - contains the T512_0 access index value for the + * VA/PgSz in question + * %g7 - contains the T512_1 access index value for the + * VA/PgSz in question + * + * Here we will clear the errors from the DTLB. + */ + set MMU_TAG_ACCESS, %g5 ! We write a TTE tag value of + stxa %g0, [%g5]ASI_DMMU ! 0 as it will be invalid. + stxa %g0, [%g4]ASI_DTLB_ACCESS ! Write the data and tag. + stxa %g0, [%g7]ASI_DTLB_ACCESS ! Now repeat for DTLB_1 way 0 + membar #Sync + + set PN_TLB_ACC_WAY_BIT, %g6 ! same thing again for way 1 + or %g4, %g6, %g4 + or %g7, %g6, %g7 + + stxa %g0, [%g4]ASI_DTLB_ACCESS ! Write same data and tag. + stxa %g0, [%g7]ASI_DTLB_ACCESS ! Now repeat for DTLB_1 way 0 + membar #Sync + + sethi %hi(FLUSH_ADDR), %g6 ! PRM says we need to issue a + flush %g6 ! 
flush after writing MMU regs + + /* + * at this point: + * %g2 - contains the VA whose lookup caused the trap + * %g3 - contains the tlo_info field + * + * Call cpu_tlb_parity_error via systrap at PIL 14 unless we're + * already at PIL 15. We do this even for TL>1 traps since + * those will lead to a system panic. + */ + set cpu_tlb_parity_error, %g1 + rdpr %pil, %g4 + cmp %g4, PIL_14 + movl %icc, PIL_14, %g4 + ba sys_trap + nop + SET_SIZE(dtlb_parity_trap) + +#endif /* lint */ + + +#if defined(lint) +/* + * Calculates the Panther TLB index based on a virtual address and page size + * + * Register usage: + * %o0 - virtual address whose index we want + * %o1 - Page Size of the TLB in question as encoded in the + * ASI_[D|I]MMU_TAG_ACCESS_EXT register. + */ +uint64_t +pn_get_tlb_index(uint64_t va, uint64_t pg_sz) +{ + return ((va + pg_sz)-(va + pg_sz)); +} +#else /* lint */ + ENTRY(pn_get_tlb_index) + + PN_GET_TLB_INDEX(%o0, %o1) + + retl + nop + SET_SIZE(pn_get_tlb_index) +#endif /* lint */ + + +#if defined(lint) +/* + * For Panther CPUs we need to flush the IPB after any I$ or D$ + * parity errors are detected. + */ +void +flush_ipb(void) +{ return; } + +#else /* lint */ + + ENTRY(flush_ipb) + clr %o0 + +flush_ipb_1: + stxa %g0, [%o0]ASI_IPB_TAG + membar #Sync + cmp %o0, PN_IPB_TAG_ADDR_MAX + blt flush_ipb_1 + add %o0, PN_IPB_TAG_ADDR_LINESIZE, %o0 + + sethi %hi(FLUSH_ADDR), %o0 + flush %o0 + retl + nop + SET_SIZE(flush_ipb) + +#endif /* lint */ diff --git a/usr/src/uts/sun4u/cpu/us3_common.c b/usr/src/uts/sun4u/cpu/us3_common.c new file mode 100644 index 0000000000..93e956f2c6 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/us3_common.c @@ -0,0 +1,6863 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/ddi.h> +#include <sys/sysmacros.h> +#include <sys/archsystm.h> +#include <sys/vmsystm.h> +#include <sys/machparam.h> +#include <sys/machsystm.h> +#include <sys/machthread.h> +#include <sys/cpu.h> +#include <sys/cmp.h> +#include <sys/elf_SPARC.h> +#include <vm/vm_dep.h> +#include <vm/hat_sfmmu.h> +#include <vm/seg_kpm.h> +#include <sys/cpuvar.h> +#include <sys/cheetahregs.h> +#include <sys/us3_module.h> +#include <sys/async.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/dditypes.h> +#include <sys/prom_debug.h> +#include <sys/prom_plat.h> +#include <sys/cpu_module.h> +#include <sys/sysmacros.h> +#include <sys/intreg.h> +#include <sys/clock.h> +#include <sys/platform_module.h> +#include <sys/machtrap.h> +#include <sys/ontrap.h> +#include <sys/panic.h> +#include <sys/memlist.h> +#include <sys/bootconf.h> +#include <sys/ivintr.h> +#include <sys/atomic.h> +#include <sys/taskq.h> +#include <sys/note.h> +#include <sys/ndifm.h> +#include <sys/ddifm.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/fm/cpu/UltraSPARC-III.h> +#include <sys/fpras_impl.h> +#include <sys/dtrace.h> +#include <sys/watchpoint.h> +#include <sys/plat_ecc_unum.h> +#include <sys/cyclic.h> +#include <sys/errorq.h> +#include <sys/errclassify.h> + +#ifdef CHEETAHPLUS_ERRATUM_25 +#include <sys/xc_impl.h> +#endif /* CHEETAHPLUS_ERRATUM_25 */ + +/* + * Note that 'Cheetah PRM' refers to: + * SPARC V9 JPS1 Implementation Supplement: Sun UltraSPARC-III + */ + +/* + * Per CPU pointers to physical address of TL>0 logout data areas. + * These pointers have to be in the kernel nucleus to avoid MMU + * misses. + */ +uint64_t ch_err_tl1_paddrs[NCPU]; + +/* + * One statically allocated structure to use during startup/DR + * to prevent unnecessary panics. + */ +ch_err_tl1_data_t ch_err_tl1_data; + +/* + * Per CPU pending error at TL>0, used by level15 softint handler + */ +uchar_t ch_err_tl1_pending[NCPU]; + +/* + * For deferred CE re-enable after trap. + */ +taskq_t *ch_check_ce_tq; + +/* + * Internal functions. 
+ */ +static int cpu_async_log_err(void *flt, errorq_elem_t *eqep); +static void cpu_log_diag_info(ch_async_flt_t *ch_flt); +static void cpu_queue_one_event(ch_async_flt_t *ch_flt, char *reason, + ecc_type_to_info_t *eccp, ch_diag_data_t *cdp); +static int clear_ecc(struct async_flt *ecc); +#if defined(CPU_IMP_ECACHE_ASSOC) +static int cpu_ecache_line_valid(ch_async_flt_t *ch_flt); +#endif +static int cpu_ecache_set_size(struct cpu *cp); +static int cpu_ectag_line_invalid(int cachesize, uint64_t tag); +static int cpu_ectag_pa_to_subblk(int cachesize, uint64_t subaddr); +static uint64_t cpu_ectag_to_pa(int setsize, uint64_t tag); +static int cpu_ectag_pa_to_subblk_state(int cachesize, + uint64_t subaddr, uint64_t tag); +static void cpu_flush_ecache_line(ch_async_flt_t *ch_flt); +static int afsr_to_afar_status(uint64_t afsr, uint64_t afsr_bit); +static int afsr_to_esynd_status(uint64_t afsr, uint64_t afsr_bit); +static int afsr_to_msynd_status(uint64_t afsr, uint64_t afsr_bit); +static int afsr_to_synd_status(uint_t cpuid, uint64_t afsr, uint64_t afsr_bit); +static int synd_to_synd_code(int synd_status, ushort_t synd, uint64_t afsr_bit); +static void cpu_uninit_ecache_scrub_dr(struct cpu *cp); +static void cpu_scrubphys(struct async_flt *aflt); +static void cpu_payload_add_aflt(struct async_flt *, nvlist_t *, nvlist_t *, + int *, int *); +static void cpu_payload_add_ecache(struct async_flt *, nvlist_t *); +static void cpu_ereport_init(struct async_flt *aflt); +static int cpu_check_secondary_errors(ch_async_flt_t *, uint64_t, uint64_t); +static uint8_t cpu_flt_bit_to_plat_error(struct async_flt *aflt); +static void cpu_log_fast_ecc_error(caddr_t tpc, int priv, int tl, uint64_t ceen, + ch_cpu_logout_t *clop); +static int cpu_ce_delayed_ec_logout(uint64_t); +static int cpu_matching_ecache_line(uint64_t, void *, int, int *); + +#ifdef CHEETAHPLUS_ERRATUM_25 +static int mondo_recover_proc(uint16_t, int); +static void cheetah_nudge_init(void); +static void cheetah_nudge_onln(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, + cyc_time_t *when); +static void cheetah_nudge_buddy(void); +#endif /* CHEETAHPLUS_ERRATUM_25 */ + +#if defined(CPU_IMP_L1_CACHE_PARITY) +static void cpu_dcache_parity_info(ch_async_flt_t *ch_flt); +static void cpu_dcache_parity_check(ch_async_flt_t *ch_flt, int index); +static void cpu_record_dc_data_parity(ch_async_flt_t *ch_flt, + ch_dc_data_t *dest_dcp, ch_dc_data_t *src_dcp, int way, int word); +static void cpu_icache_parity_info(ch_async_flt_t *ch_flt); +static void cpu_icache_parity_check(ch_async_flt_t *ch_flt, int index); +static void cpu_pcache_parity_info(ch_async_flt_t *ch_flt); +static void cpu_pcache_parity_check(ch_async_flt_t *ch_flt, int index); +static void cpu_payload_add_dcache(struct async_flt *, nvlist_t *); +static void cpu_payload_add_icache(struct async_flt *, nvlist_t *); +#endif /* CPU_IMP_L1_CACHE_PARITY */ + +int (*p2get_mem_info)(int synd_code, uint64_t paddr, + uint64_t *mem_sizep, uint64_t *seg_sizep, uint64_t *bank_sizep, + int *segsp, int *banksp, int *mcidp); + +/* + * This table is used to determine which bit(s) is(are) bad when an ECC + * error occurs. The array is indexed by an 9-bit syndrome. The entries + * of this array have the following semantics: + * + * 00-127 The number of the bad bit, when only one bit is bad. + * 128 ECC bit C0 is bad. + * 129 ECC bit C1 is bad. + * 130 ECC bit C2 is bad. + * 131 ECC bit C3 is bad. + * 132 ECC bit C4 is bad. + * 133 ECC bit C5 is bad. + * 134 ECC bit C6 is bad. + * 135 ECC bit C7 is bad. 
+ * 136 ECC bit C8 is bad. + * 137-143 reserved for Mtag Data and ECC. + * 144(M2) Two bits are bad within a nibble. + * 145(M3) Three bits are bad within a nibble. + * 146(M3) Four bits are bad within a nibble. + * 147(M) Multiple bits (5 or more) are bad. + * 148 NO bits are bad. + * Based on "Cheetah Programmer's Reference Manual" rev 1.1, Tables 11-4,11-5. + */ + +#define C0 128 +#define C1 129 +#define C2 130 +#define C3 131 +#define C4 132 +#define C5 133 +#define C6 134 +#define C7 135 +#define C8 136 +#define MT0 137 /* Mtag Data bit 0 */ +#define MT1 138 +#define MT2 139 +#define MTC0 140 /* Mtag Check bit 0 */ +#define MTC1 141 +#define MTC2 142 +#define MTC3 143 +#define M2 144 +#define M3 145 +#define M4 146 +#define M 147 +#define NA 148 +#if defined(JALAPENO) || defined(SERRANO) +#define S003 149 /* Syndrome 0x003 => likely from CPU/EDU:ST/FRU/BP */ +#define S003MEM 150 /* Syndrome 0x003 => likely from WDU/WBP */ +#define SLAST S003MEM /* last special syndrome */ +#else /* JALAPENO || SERRANO */ +#define S003 149 /* Syndrome 0x003 => likely from EDU:ST */ +#define S071 150 /* Syndrome 0x071 => likely from WDU/CPU */ +#define S11C 151 /* Syndrome 0x11c => likely from BERR/DBERR */ +#define SLAST S11C /* last special syndrome */ +#endif /* JALAPENO || SERRANO */ +#if defined(JALAPENO) || defined(SERRANO) +#define BPAR0 152 /* syndrom 152 through 167 for bus parity */ +#define BPAR15 167 +#endif /* JALAPENO || SERRANO */ + +static uint8_t ecc_syndrome_tab[] = +{ +NA, C0, C1, S003, C2, M2, M3, 47, C3, M2, M2, 53, M2, 41, 29, M, +C4, M, M, 50, M2, 38, 25, M2, M2, 33, 24, M2, 11, M, M2, 16, +C5, M, M, 46, M2, 37, 19, M2, M, 31, 32, M, 7, M2, M2, 10, +M2, 40, 13, M2, 59, M, M2, 66, M, M2, M2, 0, M2, 67, 71, M, +C6, M, M, 43, M, 36, 18, M, M2, 49, 15, M, 63, M2, M2, 6, +M2, 44, 28, M2, M, M2, M2, 52, 68, M2, M2, 62, M2, M3, M3, M4, +M2, 26, 106, M2, 64, M, M2, 2, 120, M, M2, M3, M, M3, M3, M4, +#if defined(JALAPENO) || defined(SERRANO) +116, M2, M2, M3, M2, M3, M, M4, M2, 58, 54, M2, M, M4, M4, M3, +#else /* JALAPENO || SERRANO */ +116, S071, M2, M3, M2, M3, M, M4, M2, 58, 54, M2, M, M4, M4, M3, +#endif /* JALAPENO || SERRANO */ +C7, M2, M, 42, M, 35, 17, M2, M, 45, 14, M2, 21, M2, M2, 5, +M, 27, M, M, 99, M, M, 3, 114, M2, M2, 20, M2, M3, M3, M, +M2, 23, 113, M2, 112, M2, M, 51, 95, M, M2, M3, M2, M3, M3, M2, +103, M, M2, M3, M2, M3, M3, M4, M2, 48, M, M, 73, M2, M, M3, +M2, 22, 110, M2, 109, M2, M, 9, 108, M2, M, M3, M2, M3, M3, M, +102, M2, M, M, M2, M3, M3, M, M2, M3, M3, M2, M, M4, M, M3, +98, M, M2, M3, M2, M, M3, M4, M2, M3, M3, M4, M3, M, M, M, +M2, M3, M3, M, M3, M, M, M, 56, M4, M, M3, M4, M, M, M, +C8, M, M2, 39, M, 34, 105, M2, M, 30, 104, M, 101, M, M, 4, +#if defined(JALAPENO) || defined(SERRANO) +M, M, 100, M, 83, M, M2, 12, 87, M, M, 57, M2, M, M3, M, +#else /* JALAPENO || SERRANO */ +M, M, 100, M, 83, M, M2, 12, 87, M, M, 57, S11C, M, M3, M, +#endif /* JALAPENO || SERRANO */ +M2, 97, 82, M2, 78, M2, M2, 1, 96, M, M, M, M, M, M3, M2, +94, M, M2, M3, M2, M, M3, M, M2, M, 79, M, 69, M, M4, M, +M2, 93, 92, M, 91, M, M2, 8, 90, M2, M2, M, M, M, M, M4, +89, M, M, M3, M2, M3, M3, M, M, M, M3, M2, M3, M2, M, M3, +86, M, M2, M3, M2, M, M3, M, M2, M, M3, M, M3, M, M, M3, +M, M, M3, M2, M3, M2, M4, M, 60, M, M2, M3, M4, M, M, M2, +M2, 88, 85, M2, 84, M, M2, 55, 81, M2, M2, M3, M2, M3, M3, M4, +77, M, M, M, M2, M3, M, M, M2, M3, M3, M4, M3, M2, M, M, +74, M, M2, M3, M, M, M3, M, M, M, M3, M, M3, M, M4, M3, +M2, 70, 107, M4, 65, M2, M2, M, 127, M, M, M, M2, M3, M3, M, +80, M2, 
M2, 72, M, 119, 118, M, M2, 126, 76, M, 125, M, M4, M3, +M2, 115, 124, M, 75, M, M, M3, 61, M, M4, M, M4, M, M, M, +M, 123, 122, M4, 121, M4, M, M3, 117, M2, M2, M3, M4, M3, M, M, +111, M, M, M, M4, M3, M3, M, M, M, M3, M, M3, M2, M, M +}; + +#define ESYND_TBL_SIZE (sizeof (ecc_syndrome_tab) / sizeof (uint8_t)) + +#if !(defined(JALAPENO) || defined(SERRANO)) +/* + * This table is used to determine which bit(s) is(are) bad when a Mtag + * error occurs. The array is indexed by an 4-bit ECC syndrome. The entries + * of this array have the following semantics: + * + * -1 Invalid mtag syndrome. + * 137 Mtag Data 0 is bad. + * 138 Mtag Data 1 is bad. + * 139 Mtag Data 2 is bad. + * 140 Mtag ECC 0 is bad. + * 141 Mtag ECC 1 is bad. + * 142 Mtag ECC 2 is bad. + * 143 Mtag ECC 3 is bad. + * Based on "Cheetah Programmer's Reference Manual" rev 1.1, Tables 11-6. + */ +short mtag_syndrome_tab[] = +{ +NA, MTC0, MTC1, M2, MTC2, M2, M2, MT0, MTC3, M2, M2, MT1, M2, MT2, M2, M2 +}; + +#define MSYND_TBL_SIZE (sizeof (mtag_syndrome_tab) / sizeof (short)) + +#else /* !(JALAPENO || SERRANO) */ + +#define BSYND_TBL_SIZE 16 + +#endif /* !(JALAPENO || SERRANO) */ + +/* + * CE initial classification and subsequent action lookup table + */ +static ce_dispact_t ce_disp_table[CE_INITDISPTBL_SIZE]; +static int ce_disp_inited; + +/* + * Set to disable leaky and partner check for memory correctables + */ +int ce_xdiag_off; + +/* + * The following are not incremented atomically so are indicative only + */ +static int ce_xdiag_drops; +static int ce_xdiag_lkydrops; +static int ce_xdiag_ptnrdrops; +static int ce_xdiag_bad; + +/* + * CE leaky check callback structure + */ +typedef struct { + struct async_flt *lkycb_aflt; + errorq_t *lkycb_eqp; + errorq_elem_t *lkycb_eqep; +} ce_lkychk_cb_t; + +/* + * defines for various ecache_flush_flag's + */ +#define ECACHE_FLUSH_LINE 1 +#define ECACHE_FLUSH_ALL 2 + +/* + * STICK sync + */ +#define STICK_ITERATION 10 +#define MAX_TSKEW 1 +#define EV_A_START 0 +#define EV_A_END 1 +#define EV_B_START 2 +#define EV_B_END 3 +#define EVENTS 4 + +static int64_t stick_iter = STICK_ITERATION; +static int64_t stick_tsk = MAX_TSKEW; + +typedef enum { + EVENT_NULL = 0, + SLAVE_START, + SLAVE_CONT, + MASTER_START +} event_cmd_t; + +static volatile event_cmd_t stick_sync_cmd = EVENT_NULL; +static int64_t timestamp[EVENTS]; +static volatile int slave_done; + +#ifdef DEBUG +#define DSYNC_ATTEMPTS 64 +typedef struct { + int64_t skew_val[DSYNC_ATTEMPTS]; +} ss_t; + +ss_t stick_sync_stats[NCPU]; +#endif /* DEBUG */ + +/* + * Maximum number of contexts for Cheetah. + */ +#define MAX_NCTXS (1 << 13) + +/* Will be set !NULL for Cheetah+ and derivatives. */ +uchar_t *ctx_pgsz_array = NULL; +#if defined(CPU_IMP_DUAL_PAGESIZE) +static uchar_t ctx_pgsz_arr[MAX_NCTXS]; +uint_t disable_dual_pgsz = 0; +#endif /* CPU_IMP_DUAL_PAGESIZE */ + +/* + * Save the cache bootup state for use when internal + * caches are to be re-enabled after an error occurs. + */ +uint64_t cache_boot_state; + +/* + * PA[22:0] represent Displacement in Safari configuration space. 
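+ * The mask below is (1 << 23) - 1, i.e. 0x7fffff, covering bits 22:0.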
+ */ +uint_t root_phys_addr_lo_mask = 0x7fffffu; + +bus_config_eclk_t bus_config_eclk[] = { +#if defined(JALAPENO) || defined(SERRANO) + {JBUS_CONFIG_ECLK_1_DIV, JBUS_CONFIG_ECLK_1}, + {JBUS_CONFIG_ECLK_2_DIV, JBUS_CONFIG_ECLK_2}, + {JBUS_CONFIG_ECLK_32_DIV, JBUS_CONFIG_ECLK_32}, +#else /* JALAPENO || SERRANO */ + {SAFARI_CONFIG_ECLK_1_DIV, SAFARI_CONFIG_ECLK_1}, + {SAFARI_CONFIG_ECLK_2_DIV, SAFARI_CONFIG_ECLK_2}, + {SAFARI_CONFIG_ECLK_32_DIV, SAFARI_CONFIG_ECLK_32}, +#endif /* JALAPENO || SERRANO */ + {0, 0} +}; + +/* + * Interval for deferred CEEN reenable + */ +int cpu_ceen_delay_secs = CPU_CEEN_DELAY_SECS; + +/* + * set in /etc/system to control logging of user BERR/TO's + */ +int cpu_berr_to_verbose = 0; + +/* + * set to 0 in /etc/system to defer CEEN reenable for all CEs + */ +uint64_t cpu_ce_not_deferred = CPU_CE_NOT_DEFERRED; +uint64_t cpu_ce_not_deferred_ext = CPU_CE_NOT_DEFERRED_EXT; + +/* + * Set of all offline cpus + */ +cpuset_t cpu_offline_set; + +static void cpu_delayed_check_ce_errors(void *); +static void cpu_check_ce_errors(void *); +void cpu_error_ecache_flush(ch_async_flt_t *); +static int cpu_error_ecache_flush_required(ch_async_flt_t *); +static void cpu_log_and_clear_ce(ch_async_flt_t *); +void cpu_ce_detected(ch_cpu_errors_t *, int); + +/* + * CE Leaky check timeout in microseconds. This is chosen to be twice the + * memory refresh interval of current DIMMs (64ms). After initial fix that + * gives at least one full refresh cycle in which the cell can leak + * (whereafter further refreshes simply reinforce any incorrect bit value). + */ +clock_t cpu_ce_lkychk_timeout_usec = 128000; + +/* + * CE partner check partner caching period in seconds + */ +int cpu_ce_ptnr_cachetime_sec = 60; + +/* + * Sets trap table entry ttentry by overwriting eight instructions from ttlabel + */ +#define CH_SET_TRAP(ttentry, ttlabel) \ + bcopy((const void *)&ttlabel, &ttentry, 32); \ + flush_instr_mem((caddr_t)&ttentry, 32); + +static int min_ecache_size; +static uint_t priv_hcl_1; +static uint_t priv_hcl_2; +static uint_t priv_hcl_4; +static uint_t priv_hcl_8; + +void +cpu_setup(void) +{ + extern int at_flags; + extern int disable_delay_tlb_flush, delay_tlb_flush; + extern int cpc_has_overflow_intr; + extern int disable_text_largepages; + extern int use_text_pgsz4m; + + /* + * Setup chip-specific trap handlers. + */ + cpu_init_trap(); + + cache |= (CACHE_VAC | CACHE_PTAG | CACHE_IOCOHERENT); + + at_flags = EF_SPARC_32PLUS | EF_SPARC_SUN_US1 | EF_SPARC_SUN_US3; + + /* + * save the cache bootup state. + */ + cache_boot_state = get_dcu() & DCU_CACHE; + + /* + * Use the maximum number of contexts available for Cheetah + * unless it has been tuned for debugging. + * We are checking against 0 here since this value can be patched + * while booting. It can not be patched via /etc/system since it + * will be patched too late and thus cause the system to panic. + */ + if (nctxs == 0) + nctxs = MAX_NCTXS; + + /* + * Due to the number of entries in the fully-associative tlb + * this may have to be tuned lower than in spitfire. + */ + pp_slots = MIN(8, MAXPP_SLOTS); + + /* + * Block stores do not invalidate all pages of the d$, pagecopy + * et. al. need virtual translations with virtual coloring taken + * into consideration. prefetch/ldd will pollute the d$ on the + * load side. 
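+	 * That is why pp_consistent_coloring below carries both
+	 * PPAGE_STORE_VCOLORING and PPAGE_LOADS_POLLUTE.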
+ */ + pp_consistent_coloring = PPAGE_STORE_VCOLORING | PPAGE_LOADS_POLLUTE; + + if (use_page_coloring) { + do_pg_coloring = 1; + if (use_virtual_coloring) + do_virtual_coloring = 1; + } + + isa_list = + "sparcv9+vis2 sparcv9+vis sparcv9 " + "sparcv8plus+vis2 sparcv8plus+vis sparcv8plus " + "sparcv8 sparcv8-fsmuld sparcv7 sparc"; + + /* + * On Panther-based machines, this should + * also include AV_SPARC_POPC too + */ + cpu_hwcap_flags = AV_SPARC_VIS | AV_SPARC_VIS2; + + /* + * On cheetah, there's no hole in the virtual address space + */ + hole_start = hole_end = 0; + + /* + * The kpm mapping window. + * kpm_size: + * The size of a single kpm range. + * The overall size will be: kpm_size * vac_colors. + * kpm_vbase: + * The virtual start address of the kpm range within the kernel + * virtual address space. kpm_vbase has to be kpm_size aligned. + */ + kpm_size = (size_t)(8ull * 1024 * 1024 * 1024 * 1024); /* 8TB */ + kpm_size_shift = 43; + kpm_vbase = (caddr_t)0x8000000000000000ull; /* 8EB */ + kpm_smallpages = 1; + + /* + * The traptrace code uses either %tick or %stick for + * timestamping. We have %stick so we can use it. + */ + traptrace_use_stick = 1; + + /* + * Cheetah has a performance counter overflow interrupt + */ + cpc_has_overflow_intr = 1; + + /* + * Use cheetah flush-all support + */ + if (!disable_delay_tlb_flush) + delay_tlb_flush = 1; + +#if defined(CPU_IMP_DUAL_PAGESIZE) + /* + * Use Cheetah+ and later dual page size support. + */ + if (!disable_dual_pgsz) { + ctx_pgsz_array = ctx_pgsz_arr; + } +#endif /* CPU_IMP_DUAL_PAGESIZE */ + + /* + * Declare that this architecture/cpu combination does fpRAS. + */ + fpras_implemented = 1; + + /* + * Enable 4M pages to be used for mapping user text by default. Don't + * use large pages for initialized data segments since we may not know + * at exec() time what should be the preferred large page size for DTLB + * programming. + */ + use_text_pgsz4m = 1; + disable_text_largepages = (1 << TTE64K) | (1 << TTE512K) | + (1 << TTE32M) | (1 << TTE256M); + + /* + * Setup CE lookup table + */ + CE_INITDISPTBL_POPULATE(ce_disp_table); + ce_disp_inited = 1; +} + +/* + * Called by setcpudelay + */ +void +cpu_init_tick_freq(void) +{ + /* + * For UltraSPARC III and beyond we want to use the + * system clock rate as the basis for low level timing, + * due to support of mixed speed CPUs and power managment. 
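+	 * sys_tick_freq is therefore taken directly from
+	 * system_clock_freq below; a zero system_clock_freq is fatal.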
+ */ + if (system_clock_freq == 0) + cmn_err(CE_PANIC, "setcpudelay: invalid system_clock_freq"); + + sys_tick_freq = system_clock_freq; +} + +#ifdef CHEETAHPLUS_ERRATUM_25 +/* + * Tunables + */ +int cheetah_bpe_off = 0; +int cheetah_sendmondo_recover = 1; +int cheetah_sendmondo_fullscan = 0; +int cheetah_sendmondo_recover_delay = 5; + +#define CHEETAH_LIVELOCK_MIN_DELAY 1 + +/* + * Recovery Statistics + */ +typedef struct cheetah_livelock_entry { + int cpuid; /* fallen cpu */ + int buddy; /* cpu that ran recovery */ + clock_t lbolt; /* when recovery started */ + hrtime_t recovery_time; /* time spent in recovery */ +} cheetah_livelock_entry_t; + +#define CHEETAH_LIVELOCK_NENTRY 32 + +cheetah_livelock_entry_t cheetah_livelock_hist[CHEETAH_LIVELOCK_NENTRY]; +int cheetah_livelock_entry_nxt; + +#define CHEETAH_LIVELOCK_ENTRY_NEXT(statp) { \ + statp = cheetah_livelock_hist + cheetah_livelock_entry_nxt; \ + if (++cheetah_livelock_entry_nxt >= CHEETAH_LIVELOCK_NENTRY) { \ + cheetah_livelock_entry_nxt = 0; \ + } \ +} + +#define CHEETAH_LIVELOCK_ENTRY_SET(statp, item, val) statp->item = val + +struct { + hrtime_t hrt; /* maximum recovery time */ + int recovery; /* recovered */ + int full_claimed; /* maximum pages claimed in full recovery */ + int proc_entry; /* attempted to claim TSB */ + int proc_tsb_scan; /* tsb scanned */ + int proc_tsb_partscan; /* tsb partially scanned */ + int proc_tsb_fullscan; /* whole tsb scanned */ + int proc_claimed; /* maximum pages claimed in tsb scan */ + int proc_user; /* user thread */ + int proc_kernel; /* kernel thread */ + int proc_onflt; /* bad stack */ + int proc_cpu; /* null cpu */ + int proc_thread; /* null thread */ + int proc_proc; /* null proc */ + int proc_as; /* null as */ + int proc_hat; /* null hat */ + int proc_hat_inval; /* hat contents don't make sense */ + int proc_hat_busy; /* hat is changing TSBs */ + int proc_tsb_reloc; /* TSB skipped because being relocated */ + int proc_cnum_bad; /* cnum out of range */ + int proc_cnum; /* last cnum processed */ + tte_t proc_tte; /* last tte processed */ +} cheetah_livelock_stat; + +#define CHEETAH_LIVELOCK_STAT(item) cheetah_livelock_stat.item++ + +#define CHEETAH_LIVELOCK_STATSET(item, value) \ + cheetah_livelock_stat.item = value + +#define CHEETAH_LIVELOCK_MAXSTAT(item, value) { \ + if (value > cheetah_livelock_stat.item) \ + cheetah_livelock_stat.item = value; \ +} + +/* + * Attempt to recover a cpu by claiming every cache line as saved + * in the TSB that the non-responsive cpu is using. Since we can't + * grab any adaptive lock, this is at best an attempt to do so. Because + * we don't grab any locks, we must operate under the protection of + * on_fault(). + * + * Return 1 if cpuid could be recovered, 0 if failed. 
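+ *
+ * The scan below does roughly the following (illustrative outline only;
+ * the body adds the user/kernel TSB selection and validity checks):
+ *
+ *	for (each valid, cacheable tte in the target's TSB(s)) {
+ *		idsr = getidsr();
+ *		if ((idsr & (IDSR_NACK_BIT(bn) | IDSR_BUSY_BIT(bn))) == 0)
+ *			return (1);
+ *		claimlines(pa of tte, TTEBYTES(TTE_CSZ(tte)),
+ *		    CH_ECACHE_SUBBLK_SIZE);
+ *		if ((idsr & IDSR_BUSY_BIT(bn)) == 0)
+ *			shipit(cpuid, bn);
+ *	}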
+ */ +int +mondo_recover_proc(uint16_t cpuid, int bn) +{ + label_t ljb; + cpu_t *cp; + kthread_t *t; + proc_t *p; + struct as *as; + struct hat *hat; + short cnum; + struct tsb_info *tsbinfop; + struct tsbe *tsbep; + caddr_t tsbp; + caddr_t end_tsbp; + uint64_t paddr; + uint64_t idsr; + u_longlong_t pahi, palo; + int pages_claimed = 0; + tte_t tsbe_tte; + int tried_kernel_tsb = 0; + + CHEETAH_LIVELOCK_STAT(proc_entry); + + if (on_fault(&ljb)) { + CHEETAH_LIVELOCK_STAT(proc_onflt); + goto badstruct; + } + + if ((cp = cpu[cpuid]) == NULL) { + CHEETAH_LIVELOCK_STAT(proc_cpu); + goto badstruct; + } + + if ((t = cp->cpu_thread) == NULL) { + CHEETAH_LIVELOCK_STAT(proc_thread); + goto badstruct; + } + + if ((p = ttoproc(t)) == NULL) { + CHEETAH_LIVELOCK_STAT(proc_proc); + goto badstruct; + } + + if ((as = p->p_as) == NULL) { + CHEETAH_LIVELOCK_STAT(proc_as); + goto badstruct; + } + + if ((hat = as->a_hat) == NULL) { + CHEETAH_LIVELOCK_STAT(proc_hat); + goto badstruct; + } + + if (hat != ksfmmup) { + CHEETAH_LIVELOCK_STAT(proc_user); + if (hat->sfmmu_flags & (HAT_BUSY | HAT_SWAPPED | HAT_SWAPIN)) { + CHEETAH_LIVELOCK_STAT(proc_hat_busy); + goto badstruct; + } + tsbinfop = hat->sfmmu_tsb; + if (tsbinfop == NULL) { + CHEETAH_LIVELOCK_STAT(proc_hat_inval); + goto badstruct; + } + tsbp = tsbinfop->tsb_va; + end_tsbp = tsbp + TSB_BYTES(tsbinfop->tsb_szc); + } else { + CHEETAH_LIVELOCK_STAT(proc_kernel); + tsbinfop = NULL; + tsbp = ktsb_base; + end_tsbp = tsbp + TSB_BYTES(ktsb_sz); + } + + /* Verify as */ + if (hat->sfmmu_as != as) { + CHEETAH_LIVELOCK_STAT(proc_hat_inval); + goto badstruct; + } + + cnum = hat->sfmmu_cnum; + CHEETAH_LIVELOCK_STATSET(proc_cnum, cnum); + + if ((cnum < 0) || (cnum == INVALID_CONTEXT) || (cnum >= nctxs)) { + CHEETAH_LIVELOCK_STAT(proc_cnum_bad); + goto badstruct; + } + + do { + CHEETAH_LIVELOCK_STAT(proc_tsb_scan); + + /* + * Skip TSBs being relocated. This is important because + * we want to avoid the following deadlock scenario: + * + * 1) when we came in we set ourselves to "in recover" state. + * 2) when we try to touch TSB being relocated the mapping + * will be in the suspended state so we'll spin waiting + * for it to be unlocked. + * 3) when the CPU that holds the TSB mapping locked tries to + * unlock it it will send a xtrap which will fail to xcall + * us or the CPU we're trying to recover, and will in turn + * enter the mondo code. + * 4) since we are still spinning on the locked mapping + * no further progress will be made and the system will + * inevitably hard hang. + * + * A TSB not being relocated can't begin being relocated + * while we're accessing it because we check + * sendmondo_in_recover before relocating TSBs. 
+ */ + if (hat != ksfmmup && + (tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { + CHEETAH_LIVELOCK_STAT(proc_tsb_reloc); + goto next_tsbinfo; + } + + for (tsbep = (struct tsbe *)tsbp; + tsbep < (struct tsbe *)end_tsbp; tsbep++) { + tsbe_tte = tsbep->tte_data; + + if (tsbe_tte.tte_val == 0) { + /* + * Invalid tte + */ + continue; + } + if (tsbe_tte.tte_se) { + /* + * Don't want device registers + */ + continue; + } + if (tsbe_tte.tte_cp == 0) { + /* + * Must be cached in E$ + */ + continue; + } + CHEETAH_LIVELOCK_STATSET(proc_tte, tsbe_tte); + idsr = getidsr(); + if ((idsr & (IDSR_NACK_BIT(bn) | + IDSR_BUSY_BIT(bn))) == 0) { + CHEETAH_LIVELOCK_STAT(proc_tsb_partscan); + goto done; + } + pahi = tsbe_tte.tte_pahi; + palo = tsbe_tte.tte_palo; + paddr = (uint64_t)((pahi << 32) | + (palo << MMU_PAGESHIFT)); + claimlines(paddr, TTEBYTES(TTE_CSZ(&tsbe_tte)), + CH_ECACHE_SUBBLK_SIZE); + if ((idsr & IDSR_BUSY_BIT(bn)) == 0) { + shipit(cpuid, bn); + } + pages_claimed++; + } +next_tsbinfo: + if (tsbinfop != NULL) + tsbinfop = tsbinfop->tsb_next; + if (tsbinfop != NULL) { + tsbp = tsbinfop->tsb_va; + end_tsbp = tsbp + TSB_BYTES(tsbinfop->tsb_szc); + } else if (tsbp == ktsb_base) { + tried_kernel_tsb = 1; + } else if (!tried_kernel_tsb) { + tsbp = ktsb_base; + end_tsbp = tsbp + TSB_BYTES(ktsb_sz); + hat = ksfmmup; + tsbinfop = NULL; + } + } while (tsbinfop != NULL || + ((tsbp == ktsb_base) && !tried_kernel_tsb)); + + CHEETAH_LIVELOCK_STAT(proc_tsb_fullscan); + CHEETAH_LIVELOCK_MAXSTAT(proc_claimed, pages_claimed); + no_fault(); + idsr = getidsr(); + if ((idsr & (IDSR_NACK_BIT(bn) | + IDSR_BUSY_BIT(bn))) == 0) { + return (1); + } else { + return (0); + } + +done: + no_fault(); + CHEETAH_LIVELOCK_MAXSTAT(proc_claimed, pages_claimed); + return (1); + +badstruct: + no_fault(); + return (0); +} + +/* + * Attempt to claim ownership, temporarily, of every cache line that a + * non-responsive cpu might be using. This might kick that cpu out of + * this state. + * + * The return value indicates to the caller if we have exhausted all recovery + * techniques. If 1 is returned, it is useless to call this function again + * even for a different target CPU. + */ +int +mondo_recover(uint16_t cpuid, int bn) +{ + struct memseg *seg; + uint64_t begin_pa, end_pa, cur_pa; + hrtime_t begin_hrt, end_hrt; + int retval = 0; + int pages_claimed = 0; + cheetah_livelock_entry_t *histp; + uint64_t idsr; + + if (cas32(&sendmondo_in_recover, 0, 1) != 0) { + /* + * Wait while recovery takes place + */ + while (sendmondo_in_recover) { + drv_usecwait(1); + } + /* + * Assume we didn't claim the whole memory. If + * the target of this caller is not recovered, + * it will come back. + */ + return (retval); + } + + CHEETAH_LIVELOCK_ENTRY_NEXT(histp) + CHEETAH_LIVELOCK_ENTRY_SET(histp, lbolt, lbolt); + CHEETAH_LIVELOCK_ENTRY_SET(histp, cpuid, cpuid); + CHEETAH_LIVELOCK_ENTRY_SET(histp, buddy, CPU->cpu_id); + + begin_hrt = gethrtime_waitfree(); + /* + * First try to claim the lines in the TSB the target + * may have been using. + */ + if (mondo_recover_proc(cpuid, bn) == 1) { + /* + * Didn't claim the whole memory + */ + goto done; + } + + /* + * We tried using the TSB. The target is still + * not recovered. Check if complete memory scan is + * enabled. + */ + if (cheetah_sendmondo_fullscan == 0) { + /* + * Full memory scan is disabled. + */ + retval = 1; + goto done; + } + + /* + * Try claiming the whole memory. 
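+	 * Walk every memseg a page at a time: stop as soon as the
+	 * target's BUSY/NACK bits clear in the IDSR, otherwise
+	 * claimlines() each MMU_PAGESIZE chunk and shipit() again
+	 * whenever the BUSY bit is clear.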
+ */ + for (seg = memsegs; seg; seg = seg->next) { + begin_pa = (uint64_t)(seg->pages_base) << MMU_PAGESHIFT; + end_pa = (uint64_t)(seg->pages_end) << MMU_PAGESHIFT; + for (cur_pa = begin_pa; cur_pa < end_pa; + cur_pa += MMU_PAGESIZE) { + idsr = getidsr(); + if ((idsr & (IDSR_NACK_BIT(bn) | + IDSR_BUSY_BIT(bn))) == 0) { + /* + * Didn't claim all memory + */ + goto done; + } + claimlines(cur_pa, MMU_PAGESIZE, + CH_ECACHE_SUBBLK_SIZE); + if ((idsr & IDSR_BUSY_BIT(bn)) == 0) { + shipit(cpuid, bn); + } + pages_claimed++; + } + } + + /* + * We did all we could. + */ + retval = 1; + +done: + /* + * Update statistics + */ + end_hrt = gethrtime_waitfree(); + CHEETAH_LIVELOCK_STAT(recovery); + CHEETAH_LIVELOCK_MAXSTAT(hrt, (end_hrt - begin_hrt)); + CHEETAH_LIVELOCK_MAXSTAT(full_claimed, pages_claimed); + CHEETAH_LIVELOCK_ENTRY_SET(histp, recovery_time, \ + (end_hrt - begin_hrt)); + + while (cas32(&sendmondo_in_recover, 1, 0) != 1); + + return (retval); +} + +/* + * This is called by the cyclic framework when this CPU becomes online + */ +/*ARGSUSED*/ +static void +cheetah_nudge_onln(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when) +{ + + hdlr->cyh_func = (cyc_func_t)cheetah_nudge_buddy; + hdlr->cyh_level = CY_LOW_LEVEL; + hdlr->cyh_arg = NULL; + + /* + * Stagger the start time + */ + when->cyt_when = cpu->cpu_id * (NANOSEC / NCPU); + if (cheetah_sendmondo_recover_delay < CHEETAH_LIVELOCK_MIN_DELAY) { + cheetah_sendmondo_recover_delay = CHEETAH_LIVELOCK_MIN_DELAY; + } + when->cyt_interval = cheetah_sendmondo_recover_delay * NANOSEC; +} + +/* + * Create a low level cyclic to send a xtrap to the next cpu online. + * However, there's no need to have this running on a uniprocessor system. + */ +static void +cheetah_nudge_init(void) +{ + cyc_omni_handler_t hdlr; + + if (max_ncpus == 1) { + return; + } + + hdlr.cyo_online = cheetah_nudge_onln; + hdlr.cyo_offline = NULL; + hdlr.cyo_arg = NULL; + + mutex_enter(&cpu_lock); + (void) cyclic_add_omni(&hdlr); + mutex_exit(&cpu_lock); +} + +/* + * Cyclic handler to wake up buddy + */ +void +cheetah_nudge_buddy(void) +{ + /* + * Disable kernel preemption to protect the cpu list + */ + kpreempt_disable(); + if ((CPU->cpu_next_onln != CPU) && (sendmondo_in_recover == 0)) { + xt_one(CPU->cpu_next_onln->cpu_id, (xcfunc_t *)xt_sync_tl1, + 0, 0); + } + kpreempt_enable(); +} + +#endif /* CHEETAHPLUS_ERRATUM_25 */ + +#ifdef SEND_MONDO_STATS +uint32_t x_one_stimes[64]; +uint32_t x_one_ltimes[16]; +uint32_t x_set_stimes[64]; +uint32_t x_set_ltimes[16]; +uint32_t x_set_cpus[NCPU]; +uint32_t x_nack_stimes[64]; +#endif + +/* + * Note: A version of this function is used by the debugger via the KDI, + * and must be kept in sync with this version. Any changes made to this + * function to support new chips or to accomodate errata must also be included + * in the KDI-specific version. See us3_kdi.c. + */ +void +send_one_mondo(int cpuid) +{ + int busy, nack; + uint64_t idsr, starttick, endtick, tick, lasttick; + uint64_t busymask; +#ifdef CHEETAHPLUS_ERRATUM_25 + int recovered = 0; +#endif + + CPU_STATS_ADDQ(CPU, sys, xcalls, 1); + starttick = lasttick = gettick(); + shipit(cpuid, 0); + endtick = starttick + xc_tick_limit; + busy = nack = 0; +#if defined(JALAPENO) || defined(SERRANO) + /* + * Lower 2 bits of the agent ID determine which BUSY/NACK pair + * will be used for dispatching interrupt. For now, assume + * there are no more than IDSR_BN_SETS CPUs, hence no aliasing + * issues with respect to BUSY/NACK pair usage. 
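+ * The #else branch below covers the remaining Cheetah
+ * variants, which use the full IDSR_BUSY field as the mask.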
+ */ + busymask = IDSR_BUSY_BIT(cpuid); +#else /* JALAPENO || SERRANO */ + busymask = IDSR_BUSY; +#endif /* JALAPENO || SERRANO */ + for (;;) { + idsr = getidsr(); + if (idsr == 0) + break; + + tick = gettick(); + /* + * If there is a big jump between the current tick + * count and lasttick, we have probably hit a break + * point. Adjust endtick accordingly to avoid panic. + */ + if (tick > (lasttick + xc_tick_jump_limit)) + endtick += (tick - lasttick); + lasttick = tick; + if (tick > endtick) { + if (panic_quiesce) + return; +#ifdef CHEETAHPLUS_ERRATUM_25 + if (cheetah_sendmondo_recover && recovered == 0) { + if (mondo_recover(cpuid, 0)) { + /* + * We claimed the whole memory or + * full scan is disabled. + */ + recovered++; + } + tick = gettick(); + endtick = tick + xc_tick_limit; + lasttick = tick; + /* + * Recheck idsr + */ + continue; + } else +#endif /* CHEETAHPLUS_ERRATUM_25 */ + { + cmn_err(CE_PANIC, "send mondo timeout " + "(target 0x%x) [%d NACK %d BUSY]", + cpuid, nack, busy); + } + } + + if (idsr & busymask) { + busy++; + continue; + } + drv_usecwait(1); + shipit(cpuid, 0); + nack++; + busy = 0; + } +#ifdef SEND_MONDO_STATS + { + int n = gettick() - starttick; + if (n < 8192) + x_one_stimes[n >> 7]++; + else + x_one_ltimes[(n >> 13) & 0xf]++; + } +#endif +} + +void +syncfpu(void) +{ +} + +/* + * Return processor specific async error structure + * size used. + */ +int +cpu_aflt_size(void) +{ + return (sizeof (ch_async_flt_t)); +} + +/* + * The fast_ecc_err handler transfers control here for UCU, UCC events. + * Note that we flush Ecache twice, once in the fast_ecc_err handler to + * flush the error that caused the UCU/UCC, then again here at the end to + * flush the TL=1 trap handler code out of the Ecache, so we can minimize + * the probability of getting a TL>1 Fast ECC trap when we're fielding + * another Fast ECC trap. + * + * Cheetah+ also handles: TSCE: No additional processing required. + * Panther adds L3_UCU and L3_UCC which are reported in AFSR_EXT. + * + * Note that the p_clo_flags input is only valid in cases where the + * cpu_private struct is not yet initialized (since that is the only + * time that information cannot be obtained from the logout struct.) + */ +/*ARGSUSED*/ +void +cpu_fast_ecc_error(struct regs *rp, ulong_t p_clo_flags) +{ + ch_cpu_logout_t *clop; + uint64_t ceen; + + /* + * Get the CPU log out info. If we can't find our CPU private + * pointer, then we will have to make due without any detailed + * logout information. + */ + if (CPU_PRIVATE(CPU) == NULL) { + clop = NULL; + ceen = p_clo_flags & EN_REG_CEEN; + } else { + clop = CPU_PRIVATE_PTR(CPU, chpr_fecctl0_logout); + ceen = clop->clo_flags & EN_REG_CEEN; + } + + cpu_log_fast_ecc_error((caddr_t)rp->r_pc, + (rp->r_tstate & TSTATE_PRIV) ? 1 : 0, 0, ceen, clop); +} + +/* + * Log fast ecc error, called from either Fast ECC at TL=0 or Fast + * ECC at TL>0. Need to supply either a error register pointer or a + * cpu logout structure pointer. + */ +static void +cpu_log_fast_ecc_error(caddr_t tpc, int priv, int tl, uint64_t ceen, + ch_cpu_logout_t *clop) +{ + struct async_flt *aflt; + ch_async_flt_t ch_flt; + uint64_t t_afar, t_afsr, t_afsr_ext, t_afsr_errs; + char pr_reason[MAX_REASON_STRING]; + ch_cpu_errors_t cpu_error_regs; + + bzero(&ch_flt, sizeof (ch_async_flt_t)); + /* + * If no cpu logout data, then we will have to make due without + * any detailed logout information. 
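+ * In that case the error registers themselves are captured
+ * via get_cpu_error_state() in the code below.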
+ */ + if (clop == NULL) { + ch_flt.flt_diag_data.chd_afar = LOGOUT_INVALID; + get_cpu_error_state(&cpu_error_regs); + set_cpu_error_state(&cpu_error_regs); + t_afar = cpu_error_regs.afar; + t_afsr = cpu_error_regs.afsr; + t_afsr_ext = cpu_error_regs.afsr_ext; +#if defined(SERRANO) + ch_flt.afar2 = cpu_error_regs.afar2; +#endif /* SERRANO */ + } else { + t_afar = clop->clo_data.chd_afar; + t_afsr = clop->clo_data.chd_afsr; + t_afsr_ext = clop->clo_data.chd_afsr_ext; +#if defined(SERRANO) + ch_flt.afar2 = clop->clo_data.chd_afar2; +#endif /* SERRANO */ + } + + /* + * In order to simplify code, we maintain this afsr_errs + * variable which holds the aggregate of AFSR and AFSR_EXT + * sticky bits. + */ + t_afsr_errs = (t_afsr_ext & C_AFSR_EXT_ALL_ERRS) | + (t_afsr & C_AFSR_ALL_ERRS); + pr_reason[0] = '\0'; + + /* Setup the async fault structure */ + aflt = (struct async_flt *)&ch_flt; + aflt->flt_id = gethrtime_waitfree(); + ch_flt.afsr_ext = t_afsr_ext; + ch_flt.afsr_errs = t_afsr_errs; + aflt->flt_stat = t_afsr; + aflt->flt_addr = t_afar; + aflt->flt_bus_id = getprocessorid(); + aflt->flt_inst = CPU->cpu_id; + aflt->flt_pc = tpc; + aflt->flt_prot = AFLT_PROT_NONE; + aflt->flt_class = CPU_FAULT; + aflt->flt_priv = priv; + aflt->flt_tl = tl; + aflt->flt_status = ECC_F_TRAP; + aflt->flt_panic = C_AFSR_PANIC(t_afsr_errs); + + /* + * XXXX - Phenomenal hack to get around Solaris not getting all the + * cmn_err messages out to the console. The situation is a UCU (in + * priv mode) which causes a WDU which causes a UE (on the retry). + * The messages for the UCU and WDU are enqueued and then pulled off + * the async queue via softint and syslogd starts to process them + * but doesn't get them to the console. The UE causes a panic, but + * since the UCU/WDU messages are already in transit, those aren't + * on the async queue. The hack is to check if we have a matching + * WDU event for the UCU, and if it matches, we're more than likely + * going to panic with a UE, unless we're under protection. So, we + * check to see if we got a matching WDU event and if we're under + * protection. + * + * For Cheetah/Cheetah+/Jaguar/Jalapeno, the sequence we care about + * looks like this: + * UCU->WDU->UE + * For Panther, it could look like either of these: + * UCU---->WDU->L3_WDU->UE + * L3_UCU->WDU->L3_WDU->UE + */ + if ((t_afsr_errs & (C_AFSR_UCU | C_AFSR_L3_UCU)) && + aflt->flt_panic == 0 && aflt->flt_priv != 0 && + curthread->t_ontrap == NULL && curthread->t_lofault == NULL) { + get_cpu_error_state(&cpu_error_regs); + aflt->flt_panic |= ((cpu_error_regs.afsr & C_AFSR_WDU) && + (cpu_error_regs.afar == t_afar)); + aflt->flt_panic |= ((clop == NULL) && + (t_afsr_errs & C_AFSR_WDU)); + } + + /* + * Queue events on the async event queue, one event per error bit. + * If no events are queued or no Fast ECC events are on in the AFSR, + * queue an event to complain. + */ + if (cpu_queue_events(&ch_flt, pr_reason, t_afsr_errs, clop) == 0 || + ((t_afsr_errs & (C_AFSR_FECC_ERRS | C_AFSR_EXT_FECC_ERRS)) == 0)) { + ch_flt.flt_type = CPU_INV_AFSR; + cpu_errorq_dispatch(FM_EREPORT_CPU_USIII_INVALID_AFSR, + (void *)&ch_flt, sizeof (ch_async_flt_t), ue_queue, + aflt->flt_panic); + } + + /* + * Zero out + invalidate CPU logout. + */ + if (clop) { + bzero(clop, sizeof (ch_cpu_logout_t)); + clop->clo_data.chd_afar = LOGOUT_INVALID; + } + + /* + * We carefully re-enable NCEEN and CEEN and then check if any deferred + * or disrupting errors have happened. 
We do this because if a + * deferred or disrupting error had occurred with NCEEN/CEEN off, the + * trap will not be taken when NCEEN/CEEN is re-enabled. Note that + * CEEN works differently on Cheetah than on Spitfire. Also, we enable + * NCEEN/CEEN *before* checking the AFSR to avoid the small window of a + * deferred or disrupting error happening between checking the AFSR and + * enabling NCEEN/CEEN. + * + * Note: CEEN reenabled only if it was on when trap taken. + */ + set_error_enable(get_error_enable() | (EN_REG_NCEEN | ceen)); + if (clear_errors(&ch_flt)) { + aflt->flt_panic |= ((ch_flt.afsr_errs & + (C_AFSR_EXT_ASYNC_ERRS | C_AFSR_ASYNC_ERRS)) != 0); + (void) cpu_queue_events(&ch_flt, pr_reason, ch_flt.afsr_errs, + NULL); + } + + /* + * Panic here if aflt->flt_panic has been set. Enqueued errors will + * be logged as part of the panic flow. + */ + if (aflt->flt_panic) + fm_panic("%sError(s)", pr_reason); + + /* + * Flushing the Ecache here gets the part of the trap handler that + * is run at TL=1 out of the Ecache. + */ + cpu_flush_ecache(); +} + +/* + * This is called via sys_trap from pil15_interrupt code if the + * corresponding entry in ch_err_tl1_pending is set. Checks the + * various ch_err_tl1_data structures for valid entries based on the bit + * settings in the ch_err_tl1_flags entry of the structure. + */ +/*ARGSUSED*/ +void +cpu_tl1_error(struct regs *rp, int panic) +{ + ch_err_tl1_data_t *cl1p, cl1; + int i, ncl1ps; + uint64_t me_flags; + uint64_t ceen; + + if (ch_err_tl1_paddrs[CPU->cpu_id] == 0) { + cl1p = &ch_err_tl1_data; + ncl1ps = 1; + } else if (CPU_PRIVATE(CPU) != NULL) { + cl1p = CPU_PRIVATE_PTR(CPU, chpr_tl1_err_data[0]); + ncl1ps = CH_ERR_TL1_TLMAX; + } else { + ncl1ps = 0; + } + + for (i = 0; i < ncl1ps; i++, cl1p++) { + if (cl1p->ch_err_tl1_flags == 0) + continue; + + /* + * Grab a copy of the logout data and invalidate + * the logout area. + */ + cl1 = *cl1p; + bzero(cl1p, sizeof (ch_err_tl1_data_t)); + cl1p->ch_err_tl1_logout.clo_data.chd_afar = LOGOUT_INVALID; + me_flags = CH_ERR_ME_FLAGS(cl1.ch_err_tl1_flags); + + /* + * Log "first error" in ch_err_tl1_data. + */ + if (cl1.ch_err_tl1_flags & CH_ERR_FECC) { + ceen = get_error_enable() & EN_REG_CEEN; + cpu_log_fast_ecc_error((caddr_t)cl1.ch_err_tl1_tpc, 1, + 1, ceen, &cl1.ch_err_tl1_logout); + } +#if defined(CPU_IMP_L1_CACHE_PARITY) + if (cl1.ch_err_tl1_flags & (CH_ERR_IPE | CH_ERR_DPE)) { + cpu_parity_error(rp, cl1.ch_err_tl1_flags, + (caddr_t)cl1.ch_err_tl1_tpc); + } +#endif /* CPU_IMP_L1_CACHE_PARITY */ + + /* + * Log "multiple events" in ch_err_tl1_data. Note that + * we don't read and clear the AFSR/AFAR in the TL>0 code + * if the structure is busy, we just do the cache flushing + * we have to do and then do the retry. So the AFSR/AFAR + * at this point *should* have some relevant info. If there + * are no valid errors in the AFSR, we'll assume they've + * already been picked up and logged. For I$/D$ parity, + * we just log an event with an "Unknown" (NULL) TPC. + */ + if (me_flags & CH_ERR_FECC) { + ch_cpu_errors_t cpu_error_regs; + uint64_t t_afsr_errs; + + /* + * Get the error registers and see if there's + * a pending error. If not, don't bother + * generating an "Invalid AFSR" error event. 
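+ * Any pending error found here is logged with a NULL TPC
+ * and no logout data, since neither is available for this
+ * multiple-event case.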
+ */ + get_cpu_error_state(&cpu_error_regs); + t_afsr_errs = (cpu_error_regs.afsr_ext & + C_AFSR_EXT_ALL_ERRS) | + (cpu_error_regs.afsr & C_AFSR_ALL_ERRS); + if (t_afsr_errs != 0) { + ceen = get_error_enable() & EN_REG_CEEN; + cpu_log_fast_ecc_error((caddr_t)NULL, 1, + 1, ceen, NULL); + } + } +#if defined(CPU_IMP_L1_CACHE_PARITY) + if (me_flags & (CH_ERR_IPE | CH_ERR_DPE)) { + cpu_parity_error(rp, me_flags, (caddr_t)NULL); + } +#endif /* CPU_IMP_L1_CACHE_PARITY */ + } +} + +/* + * Called from Fast ECC TL>0 handler in case of fatal error. + * cpu_tl1_error should always find an associated ch_err_tl1_data structure, + * but if we don't, we'll panic with something reasonable. + */ +/*ARGSUSED*/ +void +cpu_tl1_err_panic(struct regs *rp, ulong_t flags) +{ + cpu_tl1_error(rp, 1); + /* + * Should never return, but just in case. + */ + fm_panic("Unsurvivable ECC Error at TL>0"); +} + +/* + * The ce_err/ce_err_tl1 handlers transfer control here for CE, EMC, EDU:ST, + * EDC, WDU, WDC, CPU, CPC, IVU, IVC events. + * Disrupting errors controlled by NCEEN: EDU:ST, WDU, CPU, IVU + * Disrupting errors controlled by CEEN: CE, EMC, EDC, WDC, CPC, IVC + * + * Cheetah+ also handles (No additional processing required): + * DUE, DTO, DBERR (NCEEN controlled) + * THCE (CEEN and ET_ECC_en controlled) + * TUE (ET_ECC_en controlled) + * + * Panther further adds: + * IMU, L3_EDU, L3_WDU, L3_CPU (NCEEN controlled) + * IMC, L3_EDC, L3_WDC, L3_CPC, L3_THCE (CEEN controlled) + * TUE_SH, TUE (NCEEN and L2_tag_ECC_en controlled) + * L3_TUE, L3_TUE_SH (NCEEN and ET_ECC_en controlled) + * THCE (CEEN and L2_tag_ECC_en controlled) + * L3_THCE (CEEN and ET_ECC_en controlled) + * + * Note that the p_clo_flags input is only valid in cases where the + * cpu_private struct is not yet initialized (since that is the only + * time that information cannot be obtained from the logout struct.) + */ +/*ARGSUSED*/ +void +cpu_disrupting_error(struct regs *rp, ulong_t p_clo_flags) +{ + struct async_flt *aflt; + ch_async_flt_t ch_flt; + char pr_reason[MAX_REASON_STRING]; + ch_cpu_logout_t *clop; + uint64_t t_afar, t_afsr, t_afsr_ext, t_afsr_errs; + ch_cpu_errors_t cpu_error_regs; + + bzero(&ch_flt, sizeof (ch_async_flt_t)); + /* + * Get the CPU log out info. If we can't find our CPU private + * pointer, then we will have to make due without any detailed + * logout information. + */ + if (CPU_PRIVATE(CPU) == NULL) { + clop = NULL; + ch_flt.flt_diag_data.chd_afar = LOGOUT_INVALID; + get_cpu_error_state(&cpu_error_regs); + set_cpu_error_state(&cpu_error_regs); + t_afar = cpu_error_regs.afar; + t_afsr = cpu_error_regs.afsr; + t_afsr_ext = cpu_error_regs.afsr_ext; +#if defined(SERRANO) + ch_flt.afar2 = cpu_error_regs.afar2; +#endif /* SERRANO */ + } else { + clop = CPU_PRIVATE_PTR(CPU, chpr_cecc_logout); + t_afar = clop->clo_data.chd_afar; + t_afsr = clop->clo_data.chd_afsr; + t_afsr_ext = clop->clo_data.chd_afsr_ext; +#if defined(SERRANO) + ch_flt.afar2 = clop->clo_data.chd_afar2; +#endif /* SERRANO */ + } + + /* + * In order to simplify code, we maintain this afsr_errs + * variable which holds the aggregate of AFSR and AFSR_EXT + * sticky bits. + */ + t_afsr_errs = (t_afsr_ext & C_AFSR_EXT_ALL_ERRS) | + (t_afsr & C_AFSR_ALL_ERRS); + + pr_reason[0] = '\0'; + /* Setup the async fault structure */ + aflt = (struct async_flt *)&ch_flt; + ch_flt.afsr_ext = t_afsr_ext; + ch_flt.afsr_errs = t_afsr_errs; + aflt->flt_stat = t_afsr; + aflt->flt_addr = t_afar; + aflt->flt_pc = (caddr_t)rp->r_pc; + aflt->flt_priv = (rp->r_tstate & TSTATE_PRIV) ? 
1 : 0;
+ aflt->flt_tl = 0;
+ aflt->flt_panic = C_AFSR_PANIC(t_afsr_errs);
+
+ /*
+ * If this trap is a result of one of the errors not masked
+ * by cpu_ce_not_deferred, we don't reenable CEEN. Instead
+ * indicate that a timeout is to be set later.
+ */
+ if (!(t_afsr_errs & (cpu_ce_not_deferred | cpu_ce_not_deferred_ext)) &&
+ !aflt->flt_panic)
+ ch_flt.flt_trapped_ce = CE_CEEN_DEFER | CE_CEEN_TRAPPED;
+ else
+ ch_flt.flt_trapped_ce = CE_CEEN_NODEFER | CE_CEEN_TRAPPED;
+
+ /*
+ * log the CE and clean up
+ */
+ cpu_log_and_clear_ce(&ch_flt);
+
+ /*
+ * We re-enable CEEN (if required) and check if any disrupting errors
+ * have happened. We do this because if a disrupting error had occurred
+ * with CEEN off, the trap will not be taken when CEEN is re-enabled.
+ * Note that CEEN works differently on Cheetah than on Spitfire. Also,
+ * we enable CEEN *before* checking the AFSR to avoid the small window
+ * of an error happening between checking the AFSR and enabling CEEN.
+ */
+ if (ch_flt.flt_trapped_ce & CE_CEEN_NODEFER)
+ set_error_enable(get_error_enable() | EN_REG_CEEN);
+ if (clear_errors(&ch_flt)) {
+ (void) cpu_queue_events(&ch_flt, pr_reason, ch_flt.afsr_errs,
+ NULL);
+ }
+
+ /*
+ * Panic here if aflt->flt_panic has been set. Enqueued errors will
+ * be logged as part of the panic flow.
+ */
+ if (aflt->flt_panic)
+ fm_panic("%sError(s)", pr_reason);
+}
+
+/*
+ * The async_err handler transfers control here for UE, EMU, EDU:BLD,
+ * L3_EDU:BLD, TO, and BERR events.
+ * Deferred errors controlled by NCEEN: UE, EMU, EDU:BLD, L3_EDU:BLD, TO, BERR
+ *
+ * Cheetah+: No additional errors handled.
+ *
+ * Note that the p_clo_flags input is only valid in cases where the
+ * cpu_private struct is not yet initialized (since that is the only
+ * time that information cannot be obtained from the logout struct.)
+ */
+/*ARGSUSED*/
+void
+cpu_deferred_error(struct regs *rp, ulong_t p_clo_flags)
+{
+ ushort_t ttype, tl;
+ ch_async_flt_t ch_flt;
+ struct async_flt *aflt;
+ int trampolined = 0;
+ char pr_reason[MAX_REASON_STRING];
+ ch_cpu_logout_t *clop;
+ uint64_t ceen, clo_flags;
+ uint64_t log_afsr;
+ uint64_t t_afar, t_afsr, t_afsr_ext, t_afsr_errs;
+ ch_cpu_errors_t cpu_error_regs;
+ int expected = DDI_FM_ERR_UNEXPECTED;
+ ddi_acc_hdl_t *hp;
+
+ /*
+ * We need to look at p_flag to determine if the thread detected an
+ * error while dumping core. We can't grab p_lock here, but it's ok
+ * because we just need a consistent snapshot and we know that everyone
+ * else will store a consistent set of bits while holding p_lock. We
+ * don't have to worry about a race because SDOCORE is set once prior
+ * to doing i/o from the process's address space and is never cleared.
+ */
+ uint_t pflag = ttoproc(curthread)->p_flag;
+
+ bzero(&ch_flt, sizeof (ch_async_flt_t));
+ /*
+ * Get the CPU log out info. If we can't find our CPU private
+ * pointer then we will have to make do without any detailed
+ * logout information.
+ */ + if (CPU_PRIVATE(CPU) == NULL) { + clop = NULL; + ch_flt.flt_diag_data.chd_afar = LOGOUT_INVALID; + get_cpu_error_state(&cpu_error_regs); + set_cpu_error_state(&cpu_error_regs); + t_afar = cpu_error_regs.afar; + t_afsr = cpu_error_regs.afsr; + t_afsr_ext = cpu_error_regs.afsr_ext; +#if defined(SERRANO) + ch_flt.afar2 = cpu_error_regs.afar2; +#endif /* SERRANO */ + clo_flags = p_clo_flags; + } else { + clop = CPU_PRIVATE_PTR(CPU, chpr_async_logout); + t_afar = clop->clo_data.chd_afar; + t_afsr = clop->clo_data.chd_afsr; + t_afsr_ext = clop->clo_data.chd_afsr_ext; +#if defined(SERRANO) + ch_flt.afar2 = clop->clo_data.chd_afar2; +#endif /* SERRANO */ + clo_flags = clop->clo_flags; + } + + /* + * In order to simplify code, we maintain this afsr_errs + * variable which holds the aggregate of AFSR and AFSR_EXT + * sticky bits. + */ + t_afsr_errs = (t_afsr_ext & C_AFSR_EXT_ALL_ERRS) | + (t_afsr & C_AFSR_ALL_ERRS); + pr_reason[0] = '\0'; + + /* + * Grab information encoded into our clo_flags field. + */ + ceen = clo_flags & EN_REG_CEEN; + tl = (clo_flags & CLO_FLAGS_TL_MASK) >> CLO_FLAGS_TL_SHIFT; + ttype = (clo_flags & CLO_FLAGS_TT_MASK) >> CLO_FLAGS_TT_SHIFT; + + /* + * handle the specific error + */ + aflt = (struct async_flt *)&ch_flt; + aflt->flt_id = gethrtime_waitfree(); + aflt->flt_bus_id = getprocessorid(); + aflt->flt_inst = CPU->cpu_id; + ch_flt.afsr_ext = t_afsr_ext; + ch_flt.afsr_errs = t_afsr_errs; + aflt->flt_stat = t_afsr; + aflt->flt_addr = t_afar; + aflt->flt_pc = (caddr_t)rp->r_pc; + aflt->flt_prot = AFLT_PROT_NONE; + aflt->flt_class = CPU_FAULT; + aflt->flt_priv = (rp->r_tstate & TSTATE_PRIV) ? 1 : 0; + aflt->flt_tl = (uchar_t)tl; + aflt->flt_panic = ((tl != 0) || (aft_testfatal != 0) || + C_AFSR_PANIC(t_afsr_errs)); + aflt->flt_core = (pflag & SDOCORE) ? 1 : 0; + aflt->flt_status = ((ttype == T_DATA_ERROR) ? ECC_D_TRAP : ECC_I_TRAP); + + /* + * If the trap occurred in privileged mode at TL=0, we need to check to + * see if we were executing in the kernel under on_trap() or t_lofault + * protection. If so, modify the saved registers so that we return + * from the trap to the appropriate trampoline routine. + */ + if (aflt->flt_priv && tl == 0) { + if (curthread->t_ontrap != NULL) { + on_trap_data_t *otp = curthread->t_ontrap; + + if (otp->ot_prot & OT_DATA_EC) { + aflt->flt_prot = AFLT_PROT_EC; + otp->ot_trap |= OT_DATA_EC; + rp->r_pc = otp->ot_trampoline; + rp->r_npc = rp->r_pc + 4; + trampolined = 1; + } + + if ((t_afsr & (C_AFSR_TO | C_AFSR_BERR)) && + (otp->ot_prot & OT_DATA_ACCESS)) { + aflt->flt_prot = AFLT_PROT_ACCESS; + otp->ot_trap |= OT_DATA_ACCESS; + rp->r_pc = otp->ot_trampoline; + rp->r_npc = rp->r_pc + 4; + trampolined = 1; + /* + * for peeks and caut_gets errors are expected + */ + hp = (ddi_acc_hdl_t *)otp->ot_handle; + if (!hp) + expected = DDI_FM_ERR_PEEK; + else if (hp->ah_acc.devacc_attr_access == + DDI_CAUTIOUS_ACC) + expected = DDI_FM_ERR_EXPECTED; + } + + } else if (curthread->t_lofault) { + aflt->flt_prot = AFLT_PROT_COPY; + rp->r_g1 = EFAULT; + rp->r_pc = curthread->t_lofault; + rp->r_npc = rp->r_pc + 4; + trampolined = 1; + } + } + + /* + * If we're in user mode or we're doing a protected copy, we either + * want the ASTON code below to send a signal to the user process + * or we want to panic if aft_panic is set. + * + * If we're in privileged mode and we're not doing a copy, then we + * need to check if we've trampolined. If we haven't trampolined, + * we should panic. 
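+ * Note that BERR and TO are masked out of the aft_panic
+ * check below, so a bus error or timeout alone does not
+ * escalate to a panic in the user/copy case.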
+ */ + if (!aflt->flt_priv || aflt->flt_prot == AFLT_PROT_COPY) { + if (t_afsr_errs & + ((C_AFSR_ASYNC_ERRS | C_AFSR_EXT_ASYNC_ERRS) & + ~(C_AFSR_BERR | C_AFSR_TO))) + aflt->flt_panic |= aft_panic; + } else if (!trampolined) { + aflt->flt_panic = 1; + } + + /* + * If we've trampolined due to a privileged TO or BERR, or if an + * unprivileged TO or BERR occurred, we don't want to enqueue an + * event for that TO or BERR. Queue all other events (if any) besides + * the TO/BERR. Since we may not be enqueing any events, we need to + * ignore the number of events queued. If we haven't trampolined due + * to a TO or BERR, just enqueue events normally. + */ + log_afsr = t_afsr_errs; + if (trampolined) { + log_afsr &= ~(C_AFSR_TO | C_AFSR_BERR); + } else if (!aflt->flt_priv) { + /* + * User mode, suppress messages if + * cpu_berr_to_verbose is not set. + */ + if (!cpu_berr_to_verbose) + log_afsr &= ~(C_AFSR_TO | C_AFSR_BERR); + } + + /* + * Log any errors that occurred + */ + if (((log_afsr & + ((C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS) & ~C_AFSR_ME)) && + cpu_queue_events(&ch_flt, pr_reason, log_afsr, clop) == 0) || + (t_afsr_errs & + (C_AFSR_ASYNC_ERRS | C_AFSR_EXT_ASYNC_ERRS)) == 0) { + ch_flt.flt_type = CPU_INV_AFSR; + cpu_errorq_dispatch(FM_EREPORT_CPU_USIII_INVALID_AFSR, + (void *)&ch_flt, sizeof (ch_async_flt_t), ue_queue, + aflt->flt_panic); + } + + /* + * Zero out + invalidate CPU logout. + */ + if (clop) { + bzero(clop, sizeof (ch_cpu_logout_t)); + clop->clo_data.chd_afar = LOGOUT_INVALID; + } + +#if defined(JALAPENO) || defined(SERRANO) + /* + * UE/RUE/BERR/TO: Call our bus nexus friends to check for + * IO errors that may have resulted in this trap. + */ + if (t_afsr & (C_AFSR_UE|C_AFSR_RUE|C_AFSR_TO|C_AFSR_BERR)) { + cpu_run_bus_error_handlers(aflt, expected); + } + + /* + * UE/RUE: If UE or RUE is in memory, we need to flush the bad + * line from the Ecache. We also need to query the bus nexus for + * fatal errors. Attempts to do diagnostic read on caches may + * introduce more errors (especially when the module is bad). + */ + if (t_afsr & (C_AFSR_UE|C_AFSR_RUE)) { + /* + * Ask our bus nexus friends if they have any fatal errors. If + * so, they will log appropriate error messages. + */ + if (bus_func_invoke(BF_TYPE_UE) == BF_FATAL) + aflt->flt_panic = 1; + + /* + * We got a UE or RUE and are panicking, save the fault PA in + * a known location so that the platform specific panic code + * can check for copyback errors. + */ + if (aflt->flt_panic && cpu_flt_in_memory(&ch_flt, C_AFSR_UE)) { + panic_aflt = *aflt; + } + } + + /* + * Flush Ecache line or entire Ecache + */ + if (t_afsr & (C_AFSR_UE | C_AFSR_RUE | C_AFSR_EDU | C_AFSR_BERR)) + cpu_error_ecache_flush(&ch_flt); +#else /* JALAPENO || SERRANO */ + /* + * UE/BERR/TO: Call our bus nexus friends to check for + * IO errors that may have resulted in this trap. + */ + if (t_afsr & (C_AFSR_UE|C_AFSR_TO|C_AFSR_BERR)) { + cpu_run_bus_error_handlers(aflt, expected); + } + + /* + * UE: If the UE is in memory, we need to flush the bad + * line from the Ecache. We also need to query the bus nexus for + * fatal errors. Attempts to do diagnostic read on caches may + * introduce more errors (especially when the module is bad). + */ + if (t_afsr & C_AFSR_UE) { + /* + * Ask our legacy bus nexus friends if they have any fatal + * errors. If so, they will log appropriate error messages. 
+ */ + if (bus_func_invoke(BF_TYPE_UE) == BF_FATAL) + aflt->flt_panic = 1; + + /* + * We got a UE and are panicking, save the fault PA in a known + * location so that the platform specific panic code can check + * for copyback errors. + */ + if (aflt->flt_panic && cpu_flt_in_memory(&ch_flt, C_AFSR_UE)) { + panic_aflt = *aflt; + } + } + + /* + * Flush Ecache line or entire Ecache + */ + if (t_afsr_errs & + (C_AFSR_UE | C_AFSR_EDU | C_AFSR_BERR | C_AFSR_L3_EDU)) + cpu_error_ecache_flush(&ch_flt); +#endif /* JALAPENO || SERRANO */ + + /* + * We carefully re-enable NCEEN and CEEN and then check if any deferred + * or disrupting errors have happened. We do this because if a + * deferred or disrupting error had occurred with NCEEN/CEEN off, the + * trap will not be taken when NCEEN/CEEN is re-enabled. Note that + * CEEN works differently on Cheetah than on Spitfire. Also, we enable + * NCEEN/CEEN *before* checking the AFSR to avoid the small window of a + * deferred or disrupting error happening between checking the AFSR and + * enabling NCEEN/CEEN. + * + * Note: CEEN reenabled only if it was on when trap taken. + */ + set_error_enable(get_error_enable() | (EN_REG_NCEEN | ceen)); + if (clear_errors(&ch_flt)) { + /* + * Check for secondary errors, and avoid panicking if we + * have them + */ + if (cpu_check_secondary_errors(&ch_flt, t_afsr_errs, + t_afar) == 0) { + aflt->flt_panic |= ((ch_flt.afsr_errs & + (C_AFSR_ASYNC_ERRS | C_AFSR_EXT_ASYNC_ERRS)) != 0); + } + (void) cpu_queue_events(&ch_flt, pr_reason, ch_flt.afsr_errs, + NULL); + } + + /* + * Panic here if aflt->flt_panic has been set. Enqueued errors will + * be logged as part of the panic flow. + */ + if (aflt->flt_panic) + fm_panic("%sError(s)", pr_reason); + + /* + * If we queued an error and we are going to return from the trap and + * the error was in user mode or inside of a copy routine, set AST flag + * so the queue will be drained before returning to user mode. The + * AST processing will also act on our failure policy. + */ + if (!aflt->flt_priv || aflt->flt_prot == AFLT_PROT_COPY) { + int pcb_flag = 0; + + if (t_afsr_errs & + (C_AFSR_ASYNC_ERRS | C_AFSR_EXT_ASYNC_ERRS & + ~(C_AFSR_BERR | C_AFSR_TO))) + pcb_flag |= ASYNC_HWERR; + + if (t_afsr & C_AFSR_BERR) + pcb_flag |= ASYNC_BERR; + + if (t_afsr & C_AFSR_TO) + pcb_flag |= ASYNC_BTO; + + ttolwp(curthread)->lwp_pcb.pcb_flags |= pcb_flag; + aston(curthread); + } +} + +#if defined(CPU_IMP_L1_CACHE_PARITY) +/* + * Handling of data and instruction parity errors (traps 0x71, 0x72). + * + * For Panther, P$ data parity errors during floating point load hits + * are also detected (reported as TT 0x71) and handled by this trap + * handler. + * + * AFSR/AFAR are not set for parity errors, only TPC (a virtual address) + * is available. + */ +/*ARGSUSED*/ +void +cpu_parity_error(struct regs *rp, uint_t flags, caddr_t tpc) +{ + ch_async_flt_t ch_flt; + struct async_flt *aflt; + uchar_t tl = ((flags & CH_ERR_TL) != 0); + uchar_t iparity = ((flags & CH_ERR_IPE) != 0); + uchar_t panic = ((flags & CH_ERR_PANIC) != 0); + char *error_class; + + /* + * Log the error. + * For icache parity errors the fault address is the trap PC. + * For dcache/pcache parity errors the instruction would have to + * be decoded to determine the address and that isn't possible + * at high PIL. 
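+ * D$/P$ parity faults are therefore logged with AFLT_INV_ADDR
+ * as the fault address (see the flt_addr setting below).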
+ */ + bzero(&ch_flt, sizeof (ch_async_flt_t)); + aflt = (struct async_flt *)&ch_flt; + aflt->flt_id = gethrtime_waitfree(); + aflt->flt_bus_id = getprocessorid(); + aflt->flt_inst = CPU->cpu_id; + aflt->flt_pc = tpc; + aflt->flt_addr = iparity ? (uint64_t)tpc : AFLT_INV_ADDR; + aflt->flt_prot = AFLT_PROT_NONE; + aflt->flt_class = CPU_FAULT; + aflt->flt_priv = (tl || (rp->r_tstate & TSTATE_PRIV)) ? 1 : 0; + aflt->flt_tl = tl; + aflt->flt_panic = panic; + aflt->flt_status = iparity ? ECC_IP_TRAP : ECC_DP_TRAP; + ch_flt.flt_type = iparity ? CPU_IC_PARITY : CPU_DC_PARITY; + + if (iparity) { + cpu_icache_parity_info(&ch_flt); + if (ch_flt.parity_data.ipe.cpl_off != -1) + error_class = FM_EREPORT_CPU_USIII_IDSPE; + else if (ch_flt.parity_data.ipe.cpl_way != -1) + error_class = FM_EREPORT_CPU_USIII_ITSPE; + else + error_class = FM_EREPORT_CPU_USIII_IPE; + aflt->flt_payload = FM_EREPORT_PAYLOAD_ICACHE_PE; + } else { + cpu_dcache_parity_info(&ch_flt); + if (ch_flt.parity_data.dpe.cpl_off != -1) + error_class = FM_EREPORT_CPU_USIII_DDSPE; + else if (ch_flt.parity_data.dpe.cpl_way != -1) + error_class = FM_EREPORT_CPU_USIII_DTSPE; + else + error_class = FM_EREPORT_CPU_USIII_DPE; + aflt->flt_payload = FM_EREPORT_PAYLOAD_DCACHE_PE; + /* + * For panther we also need to check the P$ for parity errors. + */ + if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) { + cpu_pcache_parity_info(&ch_flt); + if (ch_flt.parity_data.dpe.cpl_cache == CPU_PC_PARITY) { + error_class = FM_EREPORT_CPU_USIII_PDSPE; + aflt->flt_payload = + FM_EREPORT_PAYLOAD_PCACHE_PE; + } + } + } + + cpu_errorq_dispatch(error_class, (void *)&ch_flt, + sizeof (ch_async_flt_t), ue_queue, aflt->flt_panic); + + if (iparity) { + /* + * Invalidate entire I$. + * This is required due to the use of diagnostic ASI + * accesses that may result in a loss of I$ coherency. + */ + if (cache_boot_state & DCU_IC) { + flush_icache(); + } + /* + * According to section P.3.1 of the Panther PRM, we + * need to do a little more for recovery on those + * CPUs after encountering an I$ parity error. + */ + if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) { + flush_ipb(); + correct_dcache_parity(dcache_size, + dcache_linesize); + flush_pcache(); + } + } else { + /* + * Since the valid bit is ignored when checking parity the + * D$ data and tag must also be corrected. Set D$ data bits + * to zero and set utag to 0, 1, 2, 3. + */ + correct_dcache_parity(dcache_size, dcache_linesize); + + /* + * According to section P.3.3 of the Panther PRM, we + * need to do a little more for recovery on those + * CPUs after encountering a D$ or P$ parity error. + * + * As far as clearing P$ parity errors, it is enough to + * simply invalidate all entries in the P$ since P$ parity + * error traps are only generated for floating point load + * hits. + */ + if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) { + flush_icache(); + flush_ipb(); + flush_pcache(); + } + } + + /* + * Invalidate entire D$ if it was enabled. + * This is done to avoid stale data in the D$ which might + * occur with the D$ disabled and the trap handler doing + * stores affecting lines already in the D$. + */ + if (cache_boot_state & DCU_DC) { + flush_dcache(); + } + + /* + * Restore caches to their bootup state. + */ + set_dcu(get_dcu() | cache_boot_state); + + /* + * Panic here if aflt->flt_panic has been set. Enqueued errors will + * be logged as part of the panic flow. + */ + if (aflt->flt_panic) + fm_panic("%sError(s)", iparity ? 
"IPE " : "DPE "); + + /* + * If this error occurred at TL>0 then flush the E$ here to reduce + * the chance of getting an unrecoverable Fast ECC error. This + * flush will evict the part of the parity trap handler that is run + * at TL>1. + */ + if (tl) { + cpu_flush_ecache(); + } +} + +/* + * On an I$ parity error, mark the appropriate entries in the ch_async_flt_t + * to indicate which portions of the captured data should be in the ereport. + */ +void +cpu_async_log_ic_parity_err(ch_async_flt_t *ch_flt) +{ + int way = ch_flt->parity_data.ipe.cpl_way; + int offset = ch_flt->parity_data.ipe.cpl_off; + int tag_index; + struct async_flt *aflt = (struct async_flt *)ch_flt; + + + if ((offset != -1) || (way != -1)) { + /* + * Parity error in I$ tag or data + */ + tag_index = ch_flt->parity_data.ipe.cpl_ic[way].ic_idx; + if (IS_PANTHER(cpunodes[aflt->flt_inst].implementation)) + ch_flt->parity_data.ipe.cpl_ic[way].ic_way = + PN_ICIDX_TO_WAY(tag_index); + else + ch_flt->parity_data.ipe.cpl_ic[way].ic_way = + CH_ICIDX_TO_WAY(tag_index); + ch_flt->parity_data.ipe.cpl_ic[way].ic_logflag = + IC_LOGFLAG_MAGIC; + } else { + /* + * Parity error was not identified. + * Log tags and data for all ways. + */ + for (way = 0; way < CH_ICACHE_NWAY; way++) { + tag_index = ch_flt->parity_data.ipe.cpl_ic[way].ic_idx; + if (IS_PANTHER(cpunodes[aflt->flt_inst].implementation)) + ch_flt->parity_data.ipe.cpl_ic[way].ic_way = + PN_ICIDX_TO_WAY(tag_index); + else + ch_flt->parity_data.ipe.cpl_ic[way].ic_way = + CH_ICIDX_TO_WAY(tag_index); + ch_flt->parity_data.ipe.cpl_ic[way].ic_logflag = + IC_LOGFLAG_MAGIC; + } + } +} + +/* + * On an D$ parity error, mark the appropriate entries in the ch_async_flt_t + * to indicate which portions of the captured data should be in the ereport. + */ +void +cpu_async_log_dc_parity_err(ch_async_flt_t *ch_flt) +{ + int way = ch_flt->parity_data.dpe.cpl_way; + int offset = ch_flt->parity_data.dpe.cpl_off; + int tag_index; + + if (offset != -1) { + /* + * Parity error in D$ or P$ data array. + * + * First check to see whether the parity error is in D$ or P$ + * since P$ data parity errors are reported in Panther using + * the same trap. + */ + if (ch_flt->parity_data.dpe.cpl_cache == CPU_PC_PARITY) { + tag_index = ch_flt->parity_data.dpe.cpl_pc[way].pc_idx; + ch_flt->parity_data.dpe.cpl_pc[way].pc_way = + CH_PCIDX_TO_WAY(tag_index); + ch_flt->parity_data.dpe.cpl_pc[way].pc_logflag = + PC_LOGFLAG_MAGIC; + } else { + tag_index = ch_flt->parity_data.dpe.cpl_dc[way].dc_idx; + ch_flt->parity_data.dpe.cpl_dc[way].dc_way = + CH_DCIDX_TO_WAY(tag_index); + ch_flt->parity_data.dpe.cpl_dc[way].dc_logflag = + DC_LOGFLAG_MAGIC; + } + } else if (way != -1) { + /* + * Parity error in D$ tag. + */ + tag_index = ch_flt->parity_data.dpe.cpl_dc[way].dc_idx; + ch_flt->parity_data.dpe.cpl_dc[way].dc_way = + CH_DCIDX_TO_WAY(tag_index); + ch_flt->parity_data.dpe.cpl_dc[way].dc_logflag = + DC_LOGFLAG_MAGIC; + } +} +#endif /* CPU_IMP_L1_CACHE_PARITY */ + +/* + * The cpu_async_log_err() function is called via the [uc]e_drain() function to + * post-process CPU events that are dequeued. As such, it can be invoked + * from softint context, from AST processing in the trap() flow, or from the + * panic flow. We decode the CPU-specific data, and take appropriate actions. + * Historically this entry point was used to log the actual cmn_err(9F) text; + * now with FMA it is used to prepare 'flt' to be converted into an ereport. 
+ * With FMA this function now also returns a flag which indicates to the + * caller whether the ereport should be posted (1) or suppressed (0). + */ +static int +cpu_async_log_err(void *flt, errorq_elem_t *eqep) +{ + ch_async_flt_t *ch_flt = (ch_async_flt_t *)flt; + struct async_flt *aflt = (struct async_flt *)flt; + page_t *pp; + + switch (ch_flt->flt_type) { + case CPU_INV_AFSR: + /* + * If it is a disrupting trap and the AFSR is zero, then + * the event has probably already been noted. Do not post + * an ereport. + */ + if ((aflt->flt_status & ECC_C_TRAP) && + (!(aflt->flt_stat & C_AFSR_MASK))) + return (0); + else + return (1); + case CPU_TO: + case CPU_BERR: + case CPU_FATAL: + case CPU_FPUERR: + return (1); + + case CPU_UE_ECACHE_RETIRE: + cpu_log_err(aflt); + cpu_page_retire(ch_flt); + return (1); + + /* + * Cases where we may want to suppress logging or perform + * extended diagnostics. + */ + case CPU_CE: + case CPU_EMC: + pp = page_numtopp_nolock((pfn_t) + (aflt->flt_addr >> MMU_PAGESHIFT)); + + /* + * We want to skip logging and further classification + * only if ALL the following conditions are true: + * + * 1. There is only one error + * 2. That error is a correctable memory error + * 3. The error is caused by the memory scrubber (in + * which case the error will have occurred under + * on_trap protection) + * 4. The error is on a retired page + * + * Note: AFLT_PROT_EC is used places other than the memory + * scrubber. However, none of those errors should occur + * on a retired page. + */ + if ((ch_flt->afsr_errs & + (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_CE && + aflt->flt_prot == AFLT_PROT_EC) { + + if (pp != NULL && page_isretired(pp)) { + if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) { + + /* + * Since we're skipping logging, we'll need + * to schedule the re-enabling of CEEN + */ + (void) timeout(cpu_delayed_check_ce_errors, + (void *)aflt->flt_inst, drv_usectohz( + (clock_t)cpu_ceen_delay_secs * MICROSEC)); + } + return (0); + } + } + + /* + * Perform/schedule further classification actions, but + * only if the page is healthy (we don't want bad + * pages inducing too much diagnostic activity). If we could + * not find a page pointer then we also skip this. If + * ce_scrub_xdiag_recirc returns nonzero then it has chosen + * to copy and recirculate the event (for further diagnostics) + * and we should not proceed to log it here. + * + * This must be the last step here before the cpu_log_err() + * below - if an event recirculates cpu_ce_log_err() will + * not call the current function but just proceed directly + * to cpu_ereport_post after the cpu_log_err() avoided below. + * + * Note: Check cpu_impl_async_log_err if changing this + */ + if (pp) { + if (page_isretired(pp) || page_deteriorating(pp)) { + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_PAGEDET); + } else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep, + offsetof(ch_async_flt_t, cmn_asyncflt))) { + return (0); + } + } else { + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_NOPP); + } + /*FALLTHRU*/ + + /* + * Cases where we just want to report the error and continue. + */ + case CPU_CE_ECACHE: + case CPU_UE_ECACHE: + case CPU_IV: + case CPU_ORPH: + cpu_log_err(aflt); + return (1); + + /* + * Cases where we want to fall through to handle panicking. + */ + case CPU_UE: + /* + * We want to skip logging in the same conditions as the + * CE case. In addition, we want to make sure we're not + * panicking. 
+ */ + if (!panicstr && (ch_flt->afsr_errs & + (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_UE && + aflt->flt_prot == AFLT_PROT_EC) { + page_t *pp = page_numtopp_nolock((pfn_t) + (aflt->flt_addr >> MMU_PAGESHIFT)); + + if (pp != NULL && page_isretired(pp)) { + + /* Zero the address to clear the error */ + softcall(ecc_page_zero, (void *)aflt->flt_addr); + return (0); + } + } + cpu_log_err(aflt); + break; + + default: + /* + * If the us3_common.c code doesn't know the flt_type, it may + * be an implementation-specific code. Call into the impldep + * backend to find out what to do: if it tells us to continue, + * break and handle as if falling through from a UE; if not, + * the impldep backend has handled the error and we're done. + */ + switch (cpu_impl_async_log_err(flt, eqep)) { + case CH_ASYNC_LOG_DONE: + return (1); + case CH_ASYNC_LOG_RECIRC: + return (0); + case CH_ASYNC_LOG_CONTINUE: + break; /* continue on to handle UE-like error */ + default: + cmn_err(CE_WARN, "discarding error 0x%p with " + "invalid fault type (0x%x)", + (void *)aflt, ch_flt->flt_type); + return (0); + } + } + + /* ... fall through from the UE case */ + + if (aflt->flt_addr != AFLT_INV_ADDR && aflt->flt_in_memory) { + if (!panicstr) { + cpu_page_retire(ch_flt); + } else { + /* + * Clear UEs on panic so that we don't + * get haunted by them during panic or + * after reboot + */ + cpu_clearphys(aflt); + (void) clear_errors(NULL); + } + } + + return (1); +} + +/* + * Retire the bad page that may contain the flushed error. + */ +void +cpu_page_retire(ch_async_flt_t *ch_flt) +{ + struct async_flt *aflt = (struct async_flt *)ch_flt; + page_t *pp = page_numtopp_nolock(aflt->flt_addr >> MMU_PAGESHIFT); + + if (pp != NULL) { + page_settoxic(pp, PAGE_IS_FAULTY); + (void) page_retire(pp, PAGE_IS_TOXIC); + } +} + +/* + * The cpu_log_err() function is called by cpu_async_log_err() to perform the + * generic event post-processing for correctable and uncorrectable memory, + * E$, and MTag errors. Historically this entry point was used to log bits of + * common cmn_err(9F) text; now with FMA it is used to prepare 'flt' to be + * converted into an ereport. In addition, it transmits the error to any + * platform-specific service-processor FRU logging routines, if available. + */ +void +cpu_log_err(struct async_flt *aflt) +{ + char unum[UNUM_NAMLEN]; + int len = 0; + int synd_status, synd_code, afar_status; + ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt; + + /* + * Need to turn on ECC_ECACHE for plat_get_mem_unum(). + * For Panther, L2$ is not external, so we don't want to + * generate an E$ unum for those errors. + */ + if (IS_PANTHER(cpunodes[aflt->flt_inst].implementation)) { + if (ch_flt->flt_bit & C_AFSR_EXT_L3_ERRS) + aflt->flt_status |= ECC_ECACHE; + } else { + if (ch_flt->flt_bit & C_AFSR_ECACHE) + aflt->flt_status |= ECC_ECACHE; + } + + /* + * Determine syndrome status. + */ + synd_status = afsr_to_synd_status(aflt->flt_inst, + ch_flt->afsr_errs, ch_flt->flt_bit); + + /* + * Determine afar status. + */ + if (pf_is_memory(aflt->flt_addr >> MMU_PAGESHIFT)) + afar_status = afsr_to_afar_status(ch_flt->afsr_errs, + ch_flt->flt_bit); + else + afar_status = AFLT_STAT_INVALID; + + /* + * If afar status is not invalid do a unum lookup. 
+ */ + if (afar_status != AFLT_STAT_INVALID) { + (void) cpu_get_mem_unum_aflt(synd_status, aflt, unum, + UNUM_NAMLEN, &len); + } else { + unum[0] = '\0'; + } + + synd_code = synd_to_synd_code(synd_status, + aflt->flt_synd, ch_flt->flt_bit); + + /* + * Do not send the fruid message (plat_ecc_error_data_t) + * to the SC if it can handle the enhanced error information + * (plat_ecc_error2_data_t) or when the tunable + * ecc_log_fruid_enable is set to 0. + */ + + if (&plat_ecc_capability_sc_get && + plat_ecc_capability_sc_get(PLAT_ECC_ERROR_MESSAGE)) { + if (&plat_log_fruid_error) + plat_log_fruid_error(synd_code, aflt, unum, + ch_flt->flt_bit); + } + + if (aflt->flt_func != NULL) + aflt->flt_func(aflt, unum); + + if (afar_status != AFLT_STAT_INVALID) + cpu_log_diag_info(ch_flt); + + /* + * If we have a CEEN error , we do not reenable CEEN until after + * we exit the trap handler. Otherwise, another error may + * occur causing the handler to be entered recursively. + * We set a timeout to trigger in cpu_ceen_delay_secs seconds, + * to try and ensure that the CPU makes progress in the face + * of a CE storm. + */ + if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) { + (void) timeout(cpu_delayed_check_ce_errors, + (void *)aflt->flt_inst, + drv_usectohz((clock_t)cpu_ceen_delay_secs * MICROSEC)); + } +} + +/* + * Invoked by error_init() early in startup and therefore before + * startup_errorq() is called to drain any error Q - + * + * startup() + * startup_end() + * error_init() + * cpu_error_init() + * errorq_init() + * errorq_drain() + * start_other_cpus() + * + * The purpose of this routine is to create error-related taskqs. Taskqs + * are used for this purpose because cpu_lock can't be grabbed from interrupt + * context. + */ +void +cpu_error_init(int items) +{ + /* + * Create taskq(s) to reenable CE + */ + ch_check_ce_tq = taskq_create("cheetah_check_ce", 1, minclsyspri, + items, items, TASKQ_PREPOPULATE); +} + +void +cpu_ce_log_err(struct async_flt *aflt, errorq_elem_t *eqep) +{ + char unum[UNUM_NAMLEN]; + int len; + + switch (aflt->flt_class) { + case CPU_FAULT: + cpu_ereport_init(aflt); + if (cpu_async_log_err(aflt, eqep)) + cpu_ereport_post(aflt); + break; + + case BUS_FAULT: + if (aflt->flt_func != NULL) { + (void) cpu_get_mem_unum_aflt(AFLT_STAT_VALID, aflt, + unum, UNUM_NAMLEN, &len); + aflt->flt_func(aflt, unum); + } + break; + + case RECIRC_CPU_FAULT: + aflt->flt_class = CPU_FAULT; + cpu_log_err(aflt); + cpu_ereport_post(aflt); + break; + + case RECIRC_BUS_FAULT: + ASSERT(aflt->flt_class != RECIRC_BUS_FAULT); + /*FALLTHRU*/ + default: + cmn_err(CE_WARN, "discarding CE error 0x%p with invalid " + "fault class (0x%x)", (void *)aflt, aflt->flt_class); + return; + } +} + +/* + * Scrub and classify a CE. This function must not modify the + * fault structure passed to it but instead should return the classification + * information. + */ + +static uchar_t +cpu_ce_scrub_mem_err_common(struct async_flt *ecc, boolean_t logout_tried) +{ + uchar_t disp = CE_XDIAG_EXTALG; + on_trap_data_t otd; + uint64_t orig_err; + ch_cpu_logout_t *clop; + + /* + * Clear CEEN. CPU CE TL > 0 trap handling will already have done + * this, but our other callers have not. Disable preemption to + * avoid CPU migration so that we restore CEEN on the correct + * cpu later. + * + * CEEN is cleared so that further CEs that our instruction and + * data footprint induce do not cause use to either creep down + * kernel stack to the point of overflow, or do so much CE + * notification as to make little real forward progress. 
+ * + * NCEEN must not be cleared. However it is possible that + * our accesses to the flt_addr may provoke a bus error or timeout + * if the offending address has just been unconfigured as part of + * a DR action. So we must operate under on_trap protection. + */ + kpreempt_disable(); + orig_err = get_error_enable(); + if (orig_err & EN_REG_CEEN) + set_error_enable(orig_err & ~EN_REG_CEEN); + + /* + * Our classification algorithm includes the line state before + * the scrub; we'd like this captured after the detection and + * before the algorithm below - the earlier the better. + * + * If we've come from a cpu CE trap then this info already exists + * in the cpu logout area. + * + * For a CE detected by memscrub for which there was no trap + * (running with CEEN off) cpu_log_and_clear_ce has called + * cpu_ce_delayed_ec_logout to capture some cache data, and + * marked the fault structure as incomplete as a flag to later + * logging code. + * + * If called directly from an IO detected CE there has been + * no line data capture. In this case we logout to the cpu logout + * area - that's appropriate since it's the cpu cache data we need + * for classification. We thus borrow the cpu logout area for a + * short time, and cpu_ce_delayed_ec_logout will mark it as busy in + * this time (we will invalidate it again below). + * + * If called from the partner check xcall handler then this cpu + * (the partner) has not necessarily experienced a CE at this + * address. But we want to capture line state before its scrub + * attempt since we use that in our classification. + */ + if (logout_tried == B_FALSE) { + if (!cpu_ce_delayed_ec_logout(ecc->flt_addr)) + disp |= CE_XDIAG_NOLOGOUT; + } + + /* + * Scrub memory, then check AFSR for errors. The AFAR we scrub may + * no longer be valid (if DR'd since the initial event) so we + * perform this scrub under on_trap protection. If this access is + * ok then further accesses below will also be ok - DR cannot + * proceed while this thread is active (preemption is disabled); + * to be safe we'll nonetheless use on_trap again below. + */ + if (!on_trap(&otd, OT_DATA_ACCESS)) { + cpu_scrubphys(ecc); + } else { + no_trap(); + if (orig_err & EN_REG_CEEN) + set_error_enable(orig_err); + kpreempt_enable(); + return (disp); + } + no_trap(); + + /* + * Did the casx read of the scrub log a CE that matches the AFAR? + * Note that it's quite possible that the read sourced the data from + * another cpu. + */ + if (clear_ecc(ecc)) + disp |= CE_XDIAG_CE1; + + /* + * Read the data again. This time the read is very likely to + * come from memory since the scrub induced a writeback to memory. + */ + if (!on_trap(&otd, OT_DATA_ACCESS)) { + (void) lddphys(P2ALIGN(ecc->flt_addr, 8)); + } else { + no_trap(); + if (orig_err & EN_REG_CEEN) + set_error_enable(orig_err); + kpreempt_enable(); + return (disp); + } + no_trap(); + + /* Did that read induce a CE that matches the AFAR? */ + if (clear_ecc(ecc)) + disp |= CE_XDIAG_CE2; + + /* + * Look at the logout information and record whether we found the + * line in l2/l3 cache. For Panther we are interested in whether + * we found it in either cache (it won't reside in both but + * it is possible to read it that way given the moving target). + */ + clop = CPU_PRIVATE(CPU) ? 
CPU_PRIVATE_PTR(CPU, chpr_cecc_logout) : NULL; + if (!(disp & CE_XDIAG_NOLOGOUT) && clop && + clop->clo_data.chd_afar != LOGOUT_INVALID) { + int hit, level; + int state; + int totalsize; + ch_ec_data_t *ecp; + + /* + * If hit is nonzero then a match was found and hit will + * be one greater than the index which hit. For Panther we + * also need to pay attention to level to see which of l2$ or + * l3$ it hit in. + */ + hit = cpu_matching_ecache_line(ecc->flt_addr, &clop->clo_data, + 0, &level); + + if (hit) { + --hit; + disp |= CE_XDIAG_AFARMATCH; + + if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) { + if (level == 2) + ecp = &clop->clo_data.chd_l2_data[hit]; + else + ecp = &clop->clo_data.chd_ec_data[hit]; + } else { + ASSERT(level == 2); + ecp = &clop->clo_data.chd_ec_data[hit]; + } + totalsize = cpunodes[CPU->cpu_id].ecache_size; + state = cpu_ectag_pa_to_subblk_state(totalsize, + ecc->flt_addr, ecp->ec_tag); + + /* + * Cheetah variants use different state encodings - + * the CH_ECSTATE_* defines vary depending on the + * module we're compiled for. Translate into our + * one true version. Conflate Owner-Shared state + * of SSM mode with Owner as victimisation of such + * lines may cause a writeback. + */ + switch (state) { + case CH_ECSTATE_MOD: + disp |= EC_STATE_M; + break; + + case CH_ECSTATE_OWN: + case CH_ECSTATE_OWS: + disp |= EC_STATE_O; + break; + + case CH_ECSTATE_EXL: + disp |= EC_STATE_E; + break; + + case CH_ECSTATE_SHR: + disp |= EC_STATE_S; + break; + + default: + disp |= EC_STATE_I; + break; + } + } + + /* + * If we initiated the delayed logout then we are responsible + * for invalidating the logout area. + */ + if (logout_tried == B_FALSE) { + bzero(clop, sizeof (ch_cpu_logout_t)); + clop->clo_data.chd_afar = LOGOUT_INVALID; + } + } + + /* + * Re-enable CEEN if we turned it off. + */ + if (orig_err & EN_REG_CEEN) + set_error_enable(orig_err); + kpreempt_enable(); + + return (disp); +} + +/* + * Scrub a correctable memory error and collect data for classification + * of CE type. This function is called in the detection path, ie tl0 handling + * of a correctable error trap (cpus) or interrupt (IO) at high PIL. + */ +void +cpu_ce_scrub_mem_err(struct async_flt *ecc, boolean_t logout_tried) +{ + /* + * Cheetah CE classification does not set any bits in flt_status. + * Instead we will record classification datapoints in flt_disp. + */ + ecc->flt_status &= ~(ECC_INTERMITTENT | ECC_PERSISTENT | ECC_STICKY); + + /* + * To check if the error detected by IO is persistent, sticky or + * intermittent. This is noticed by clear_ecc(). + */ + if (ecc->flt_status & ECC_IOBUS) + ecc->flt_stat = C_AFSR_MEMORY; + + /* + * Record information from this first part of the algorithm in + * flt_disp. + */ + ecc->flt_disp = cpu_ce_scrub_mem_err_common(ecc, logout_tried); +} + +/* + * Select a partner to perform a further CE classification check from. + * Must be called with kernel preemption disabled (to stop the cpu list + * from changing). The detecting cpu we are partnering has cpuid + * aflt->flt_inst; we might not be running on the detecting cpu. + * + * Restrict choice to active cpus in the same cpu partition as ourselves in + * an effort to stop bad cpus in one partition causing other partitions to + * perform excessive diagnostic activity. Actually since the errorq drain + * is run from a softint most of the time and that is a global mechanism + * this isolation is only partial. Return NULL if we fail to find a + * suitable partner. 
+ * + * We prefer a partner that is in a different latency group to ourselves as + * we will share fewer datapaths. If such a partner is unavailable then + * choose one in the same lgroup but prefer a different chip and only allow + * a sibling core if flags includes PTNR_SIBLINGOK. If all else fails and + * flags includes PTNR_SELFOK then permit selection of the original detector. + * + * We keep a cache of the last partner selected for a cpu, and we'll try to + * use that previous partner if no more than cpu_ce_ptnr_cachetime_sec seconds + * have passed since that selection was made. This provides the benefit + * of the point-of-view of different partners over time but without + * requiring frequent cpu list traversals. + */ + +#define PTNR_SIBLINGOK 0x1 /* Allow selection of sibling core */ +#define PTNR_SELFOK 0x2 /* Allow selection of cpu to "partner" itself */ + +static cpu_t * +ce_ptnr_select(struct async_flt *aflt, int flags, int *typep) +{ + cpu_t *sp, *dtcr, *ptnr, *locptnr, *sibptnr; + hrtime_t lasttime, thistime; + + ASSERT(curthread->t_preempt > 0 || getpil() >= DISP_LEVEL); + + dtcr = cpu[aflt->flt_inst]; + + /* + * Short-circuit for the following cases: + * . the dtcr is not flagged active + * . there is just one cpu present + * . the detector has disappeared + * . we were given a bad flt_inst cpuid; this should not happen + * (eg PCI code now fills flt_inst) but if it does it is no + * reason to panic. + * . there is just one cpu left online in the cpu partition + * + * If we return NULL after this point then we do not update the + * chpr_ceptnr_seltime which will cause us to perform a full lookup + * again next time; this is the case where the only other cpu online + * in the detector's partition is on the same chip as the detector + * and since CEEN re-enable is throttled even that case should not + * hurt performance. + */ + if (dtcr == NULL || !cpu_flagged_active(dtcr->cpu_flags)) { + return (NULL); + } + if (ncpus == 1 || dtcr->cpu_part->cp_ncpus == 1) { + if (flags & PTNR_SELFOK) { + *typep = CE_XDIAG_PTNR_SELF; + return (dtcr); + } else { + return (NULL); + } + } + + thistime = gethrtime(); + lasttime = CPU_PRIVATE_VAL(dtcr, chpr_ceptnr_seltime); + + /* + * Select a starting point. + */ + if (!lasttime) { + /* + * We've never selected a partner for this detector before. + * Start the scan at the next online cpu in the same cpu + * partition. + */ + sp = dtcr->cpu_next_part; + } else if (thistime - lasttime < cpu_ce_ptnr_cachetime_sec * NANOSEC) { + /* + * Our last selection has not aged yet. If this partner: + * . is still a valid cpu, + * . is still in the same partition as the detector + * . is still marked active + * . satisfies the 'flags' argument criteria + * then select it again without updating the timestamp. + */ + sp = cpu[CPU_PRIVATE_VAL(dtcr, chpr_ceptnr_id)]; + if (sp == NULL || sp->cpu_part != dtcr->cpu_part || + !cpu_flagged_active(sp->cpu_flags) || + (sp == dtcr && !(flags & PTNR_SELFOK)) || + (sp->cpu_chip->chip_id == dtcr->cpu_chip->chip_id && + !(flags & PTNR_SIBLINGOK))) { + sp = dtcr->cpu_next_part; + } else { + if (sp->cpu_lpl->lpl_lgrp != dtcr->cpu_lpl->lpl_lgrp) { + *typep = CE_XDIAG_PTNR_REMOTE; + } else if (sp == dtcr) { + *typep = CE_XDIAG_PTNR_SELF; + } else if (sp->cpu_chip->chip_id == + dtcr->cpu_chip->chip_id) { + *typep = CE_XDIAG_PTNR_SIBLING; + } else { + *typep = CE_XDIAG_PTNR_LOCAL; + } + return (sp); + } + } else { + /* + * Our last selection has aged. 
If it is nonetheless still a + * valid cpu then start the scan at the next cpu in the + * partition after our last partner. If the last selection + * is no longer a valid cpu then go with our default. In + * this way we slowly cycle through possible partners to + * obtain multiple viewpoints over time. + */ + sp = cpu[CPU_PRIVATE_VAL(dtcr, chpr_ceptnr_id)]; + if (sp == NULL) { + sp = dtcr->cpu_next_part; + } else { + sp = sp->cpu_next_part; /* may be dtcr */ + if (sp->cpu_part != dtcr->cpu_part) + sp = dtcr; + } + } + + /* + * We have a proposed starting point for our search, but if this + * cpu is offline then its cpu_next_part will point to itself + * so we can't use that to iterate over cpus in this partition in + * the loop below. We still want to avoid iterating over cpus not + * in our partition, so in the case that our starting point is offline + * we will repoint it to be the detector itself; and if the detector + * happens to be offline we'll return NULL from the following loop. + */ + if (!cpu_flagged_active(sp->cpu_flags)) { + sp = dtcr; + } + + ptnr = sp; + locptnr = NULL; + sibptnr = NULL; + do { + if (ptnr == dtcr || !cpu_flagged_active(ptnr->cpu_flags)) + continue; + if (ptnr->cpu_lpl->lpl_lgrp != dtcr->cpu_lpl->lpl_lgrp) { + CPU_PRIVATE_VAL(dtcr, chpr_ceptnr_id) = ptnr->cpu_id; + CPU_PRIVATE_VAL(dtcr, chpr_ceptnr_seltime) = thistime; + *typep = CE_XDIAG_PTNR_REMOTE; + return (ptnr); + } + if (ptnr->cpu_chip->chip_id == dtcr->cpu_chip->chip_id) { + if (sibptnr == NULL) + sibptnr = ptnr; + continue; + } + if (locptnr == NULL) + locptnr = ptnr; + } while ((ptnr = ptnr->cpu_next_part) != sp); + + /* + * A foreign partner has already been returned if one was available. + * + * If locptnr is not NULL it is a cpu in the same lgroup as the + * detector, is active, and is not a sibling of the detector. + * + * If sibptnr is not NULL it is a sibling of the detector, and is + * active. + * + * If we have to resort to using the detector itself we have already + * checked that it is active. + */ + if (locptnr) { + CPU_PRIVATE_VAL(dtcr, chpr_ceptnr_id) = locptnr->cpu_id; + CPU_PRIVATE_VAL(dtcr, chpr_ceptnr_seltime) = thistime; + *typep = CE_XDIAG_PTNR_LOCAL; + return (locptnr); + } else if (sibptnr && flags & PTNR_SIBLINGOK) { + CPU_PRIVATE_VAL(dtcr, chpr_ceptnr_id) = sibptnr->cpu_id; + CPU_PRIVATE_VAL(dtcr, chpr_ceptnr_seltime) = thistime; + *typep = CE_XDIAG_PTNR_SIBLING; + return (sibptnr); + } else if (flags & PTNR_SELFOK) { + CPU_PRIVATE_VAL(dtcr, chpr_ceptnr_id) = dtcr->cpu_id; + CPU_PRIVATE_VAL(dtcr, chpr_ceptnr_seltime) = thistime; + *typep = CE_XDIAG_PTNR_SELF; + return (dtcr); + } + + return (NULL); +} + +/* + * Cross call handler that is requested to run on the designated partner of + * a cpu that experienced a possibly sticky or possibly persistnet CE. + */ +static void +ce_ptnrchk_xc(struct async_flt *aflt, uchar_t *dispp) +{ + *dispp = cpu_ce_scrub_mem_err_common(aflt, B_FALSE); +} + +/* + * The associated errorqs are never destroyed so we do not need to deal with + * them disappearing before this timeout fires. If the affected memory + * has been DR'd out since the original event the scrub algrithm will catch + * any errors and return null disposition info. If the original detecting + * cpu has been DR'd out then ereport detector info will not be able to + * lookup CPU type; with a small timeout this is unlikely. 
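+ *
+ * Roughly, when the timeout fires:
+ *
+ *	ce_lkychk_cb(cbarg)
+ *		kpreempt_disable()
+ *		ce_ptnr_select()	! self and sibling cores are acceptable
+ *		xc_one() the partner to rescrub the afar (ce_ptnrchk_xc)
+ *		record the disposition and partner info in flt_disp
+ *		kpreempt_enable()
+ *		errorq_commit() the held event so it finally gets logged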
+ */ +static void +ce_lkychk_cb(ce_lkychk_cb_t *cbarg) +{ + struct async_flt *aflt = cbarg->lkycb_aflt; + uchar_t disp; + cpu_t *cp; + int ptnrtype; + + kpreempt_disable(); + if (cp = ce_ptnr_select(aflt, PTNR_SIBLINGOK | PTNR_SELFOK, + &ptnrtype)) { + xc_one(cp->cpu_id, (xcfunc_t *)ce_ptnrchk_xc, (uint64_t)aflt, + (uint64_t)&disp); + CE_XDIAG_SETLKYINFO(aflt->flt_disp, disp); + CE_XDIAG_SETPTNRID(aflt->flt_disp, cp->cpu_id); + CE_XDIAG_SETPTNRTYPE(aflt->flt_disp, ptnrtype); + } else { + ce_xdiag_lkydrops++; + if (ncpus > 1) + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_NOPTNR); + } + kpreempt_enable(); + + errorq_commit(cbarg->lkycb_eqp, cbarg->lkycb_eqep, ERRORQ_ASYNC); + kmem_free(cbarg, sizeof (ce_lkychk_cb_t)); +} + +/* + * Called from errorq drain code when processing a CE error, both from + * CPU and PCI drain functions. Decide what further classification actions, + * if any, we will perform. Perform immediate actions now, and schedule + * delayed actions as required. Note that we are no longer necessarily running + * on the detecting cpu, and that the async_flt structure will not persist on + * return from this function. + * + * Calls to this function should aim to be self-throtlling in some way. With + * the delayed re-enable of CEEN the absolute rate of calls should not + * be excessive. Callers should also avoid performing in-depth classification + * for events in pages that are already known to be suspect. + * + * We return nonzero to indicate that the event has been copied and + * recirculated for further testing. The caller should not log the event + * in this case - it will be logged when further test results are available. + * + * Our possible contexts are that of errorq_drain: below lock level or from + * panic context. We can assume that the cpu we are running on is online. + */ + + +#ifdef DEBUG +static int ce_xdiag_forceaction; +#endif + +int +ce_scrub_xdiag_recirc(struct async_flt *aflt, errorq_t *eqp, + errorq_elem_t *eqep, size_t afltoffset) +{ + ce_dispact_t dispact, action; + cpu_t *cp; + uchar_t dtcrinfo, disp; + int ptnrtype; + + if (!ce_disp_inited || panicstr || ce_xdiag_off) { + ce_xdiag_drops++; + return (0); + } else if (!aflt->flt_in_memory) { + ce_xdiag_drops++; + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, CE_XDIAG_SKIP_NOTMEM); + return (0); + } + + dtcrinfo = CE_XDIAG_DTCRINFO(aflt->flt_disp); + + /* + * Some correctable events are not scrubbed/classified, such as those + * noticed at the tail of cpu_deferred_error. So if there is no + * initial detector classification go no further. + */ + if (!CE_XDIAG_EXT_ALG_APPLIED(dtcrinfo)) { + ce_xdiag_drops++; + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, CE_XDIAG_SKIP_NOSCRUB); + return (0); + } + + dispact = CE_DISPACT(ce_disp_table, + CE_XDIAG_AFARMATCHED(dtcrinfo), + CE_XDIAG_STATE(dtcrinfo), + CE_XDIAG_CE1SEEN(dtcrinfo), + CE_XDIAG_CE2SEEN(dtcrinfo)); + + + action = CE_ACT(dispact); /* bad lookup caught below */ +#ifdef DEBUG + if (ce_xdiag_forceaction != 0) + action = ce_xdiag_forceaction; +#endif + + switch (action) { + case CE_ACT_LKYCHK: { + caddr_t ndata; + errorq_elem_t *neqep; + struct async_flt *ecc; + ce_lkychk_cb_t *cbargp; + + if ((ndata = errorq_elem_dup(eqp, eqep, &neqep)) == NULL) { + ce_xdiag_lkydrops++; + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_DUPFAIL); + break; + } + ecc = (struct async_flt *)(ndata + afltoffset); + + ASSERT(ecc->flt_class == CPU_FAULT || + ecc->flt_class == BUS_FAULT); + ecc->flt_class = (ecc->flt_class == CPU_FAULT) ? 
+ RECIRC_CPU_FAULT : RECIRC_BUS_FAULT; + + cbargp = kmem_alloc(sizeof (ce_lkychk_cb_t), KM_SLEEP); + cbargp->lkycb_aflt = ecc; + cbargp->lkycb_eqp = eqp; + cbargp->lkycb_eqep = neqep; + + (void) timeout((void (*)(void *))ce_lkychk_cb, + (void *)cbargp, drv_usectohz(cpu_ce_lkychk_timeout_usec)); + return (1); + } + + case CE_ACT_PTNRCHK: + kpreempt_disable(); /* stop cpu list changing */ + if ((cp = ce_ptnr_select(aflt, 0, &ptnrtype)) != NULL) { + xc_one(cp->cpu_id, (xcfunc_t *)ce_ptnrchk_xc, + (uint64_t)aflt, (uint64_t)&disp); + CE_XDIAG_SETPTNRINFO(aflt->flt_disp, disp); + CE_XDIAG_SETPTNRID(aflt->flt_disp, cp->cpu_id); + CE_XDIAG_SETPTNRTYPE(aflt->flt_disp, ptnrtype); + } else if (ncpus > 1) { + ce_xdiag_ptnrdrops++; + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_NOPTNR); + } else { + ce_xdiag_ptnrdrops++; + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_UNIPROC); + } + kpreempt_enable(); + break; + + case CE_ACT_DONE: + break; + + case CE_ACT(CE_DISP_BAD): + default: +#ifdef DEBUG + cmn_err(CE_PANIC, "ce_scrub_post: Bad action '%d'", action); +#endif + ce_xdiag_bad++; + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, CE_XDIAG_SKIP_ACTBAD); + break; + } + + return (0); +} + +/* + * We route all errors through a single switch statement. + */ +void +cpu_ue_log_err(struct async_flt *aflt) +{ + switch (aflt->flt_class) { + case CPU_FAULT: + cpu_ereport_init(aflt); + if (cpu_async_log_err(aflt, NULL)) + cpu_ereport_post(aflt); + break; + + case BUS_FAULT: + bus_async_log_err(aflt); + break; + + default: + cmn_err(CE_WARN, "discarding async error %p with invalid " + "fault class (0x%x)", (void *)aflt, aflt->flt_class); + return; + } +} + +/* + * Routine for panic hook callback from panic_idle(). + */ +void +cpu_async_panic_callb(void) +{ + ch_async_flt_t ch_flt; + struct async_flt *aflt; + ch_cpu_errors_t cpu_error_regs; + uint64_t afsr_errs; + + get_cpu_error_state(&cpu_error_regs); + + afsr_errs = (cpu_error_regs.afsr & C_AFSR_ALL_ERRS) | + (cpu_error_regs.afsr_ext & C_AFSR_EXT_L3_ERRS); + + if (afsr_errs) { + + bzero(&ch_flt, sizeof (ch_async_flt_t)); + aflt = (struct async_flt *)&ch_flt; + aflt->flt_id = gethrtime_waitfree(); + aflt->flt_bus_id = getprocessorid(); + aflt->flt_inst = CPU->cpu_id; + aflt->flt_stat = cpu_error_regs.afsr; + aflt->flt_addr = cpu_error_regs.afar; + aflt->flt_prot = AFLT_PROT_NONE; + aflt->flt_class = CPU_FAULT; + aflt->flt_priv = ((cpu_error_regs.afsr & C_AFSR_PRIV) != 0); + aflt->flt_panic = 1; + ch_flt.afsr_ext = cpu_error_regs.afsr_ext; + ch_flt.afsr_errs = afsr_errs; +#if defined(SERRANO) + ch_flt.afar2 = cpu_error_regs.afar2; +#endif /* SERRANO */ + (void) cpu_queue_events(&ch_flt, NULL, afsr_errs, NULL); + } +} + +/* + * Routine to convert a syndrome into a syndrome code. + */ +static int +synd_to_synd_code(int synd_status, ushort_t synd, uint64_t afsr_bit) +{ + if (synd_status == AFLT_STAT_INVALID) + return (-1); + + /* + * Use the syndrome to index the appropriate syndrome table, + * to get the code indicating which bit(s) is(are) bad. 
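+ *
+ * For example, mirroring the lookups done just below:
+ *	Mtag syndrome errors:	mtag_syndrome_tab[synd]
+ *				(Jalapeno/Serrano return BPAR0 + synd instead)
+ *	E$/memory syndromes:	ecc_syndrome_tab[synd]
+ * A syndrome of 0, or one outside the table, yields -1, which callers
+ * treat as "no single-bit code".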
+ */ + if (afsr_bit & + (C_AFSR_MSYND_ERRS | C_AFSR_ESYND_ERRS | C_AFSR_EXT_ESYND_ERRS)) { + if (afsr_bit & C_AFSR_MSYND_ERRS) { +#if defined(JALAPENO) || defined(SERRANO) + if ((synd == 0) || (synd >= BSYND_TBL_SIZE)) + return (-1); + else + return (BPAR0 + synd); +#else /* JALAPENO || SERRANO */ + if ((synd == 0) || (synd >= MSYND_TBL_SIZE)) + return (-1); + else + return (mtag_syndrome_tab[synd]); +#endif /* JALAPENO || SERRANO */ + } else { + if ((synd == 0) || (synd >= ESYND_TBL_SIZE)) + return (-1); + else + return (ecc_syndrome_tab[synd]); + } + } else { + return (-1); + } +} + +/* + * Routine to return a string identifying the physical name + * associated with a memory/cache error. + */ +int +cpu_get_mem_unum(int synd_status, ushort_t flt_synd, uint64_t flt_stat, + uint64_t flt_addr, int flt_bus_id, int flt_in_memory, + ushort_t flt_status, char *buf, int buflen, int *lenp) +{ + int synd_code; + int ret; + + /* + * An AFSR of -1 defaults to a memory syndrome. + */ + if (flt_stat == (uint64_t)-1) + flt_stat = C_AFSR_CE; + + synd_code = synd_to_synd_code(synd_status, flt_synd, flt_stat); + + /* + * Syndrome code must be either a single-bit error code + * (0...143) or -1 for unum lookup. + */ + if (synd_code < 0 || synd_code >= M2) + synd_code = -1; + if (&plat_get_mem_unum) { + if ((ret = plat_get_mem_unum(synd_code, flt_addr, flt_bus_id, + flt_in_memory, flt_status, buf, buflen, lenp)) != 0) { + buf[0] = '\0'; + *lenp = 0; + } + + return (ret); + } + + return (ENOTSUP); +} + +/* + * Wrapper for cpu_get_mem_unum() routine that takes an + * async_flt struct rather than explicit arguments. + */ +int +cpu_get_mem_unum_aflt(int synd_status, struct async_flt *aflt, + char *buf, int buflen, int *lenp) +{ + /* + * If we come thru here for an IO bus error aflt->flt_stat will + * not be the CPU AFSR, and we pass in a -1 to cpu_get_mem_unum() + * so it will interpret this as a memory error. + */ + return (cpu_get_mem_unum(synd_status, aflt->flt_synd, + (aflt->flt_class == BUS_FAULT) ? + (uint64_t)-1 : ((ch_async_flt_t *)(aflt))->afsr_errs, + aflt->flt_addr, aflt->flt_bus_id, aflt->flt_in_memory, + aflt->flt_status, buf, buflen, lenp)); +} + +/* + * This routine is a more generic interface to cpu_get_mem_unum() + * that may be used by other modules (e.g. mm). + */ +int +cpu_get_mem_name(uint64_t synd, uint64_t *afsr, uint64_t afar, + char *buf, int buflen, int *lenp) +{ + int synd_status, flt_in_memory, ret; + ushort_t flt_status = 0; + char unum[UNUM_NAMLEN]; + + /* + * Check for an invalid address. + */ + if (afar == (uint64_t)-1) + return (ENXIO); + + if (synd == (uint64_t)-1) + synd_status = AFLT_STAT_INVALID; + else + synd_status = AFLT_STAT_VALID; + + flt_in_memory = (*afsr & C_AFSR_MEMORY) && + pf_is_memory(afar >> MMU_PAGESHIFT); + + /* + * Need to turn on ECC_ECACHE for plat_get_mem_unum(). + * For Panther, L2$ is not external, so we don't want to + * generate an E$ unum for those errors. + */ + if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) { + if (*(afsr + 1) & C_AFSR_EXT_L3_ERRS) + flt_status |= ECC_ECACHE; + } else { + if (*afsr & C_AFSR_ECACHE) + flt_status |= ECC_ECACHE; + } + + ret = cpu_get_mem_unum(synd_status, (ushort_t)synd, *afsr, afar, + CPU->cpu_id, flt_in_memory, flt_status, unum, UNUM_NAMLEN, lenp); + if (ret != 0) + return (ret); + + if (*lenp >= buflen) + return (ENAMETOOLONG); + + (void) strncpy(buf, unum, buflen); + + return (0); +} + +/* + * Routine to return memory information associated + * with a physical address and syndrome. 
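+ *
+ * This simply forwards to the platform's p2get_mem_info() hook if one is
+ * registered, and returns ENOTSUP otherwise.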
+ */ +int +cpu_get_mem_info(uint64_t synd, uint64_t afar, + uint64_t *mem_sizep, uint64_t *seg_sizep, uint64_t *bank_sizep, + int *segsp, int *banksp, int *mcidp) +{ + int synd_status, synd_code; + + if (afar == (uint64_t)-1) + return (ENXIO); + + if (synd == (uint64_t)-1) + synd_status = AFLT_STAT_INVALID; + else + synd_status = AFLT_STAT_VALID; + + synd_code = synd_to_synd_code(synd_status, synd, C_AFSR_CE); + + if (p2get_mem_info != NULL) + return ((p2get_mem_info)(synd_code, afar, + mem_sizep, seg_sizep, bank_sizep, + segsp, banksp, mcidp)); + else + return (ENOTSUP); +} + +/* + * Routine to return a string identifying the physical + * name associated with a cpuid. + */ +int +cpu_get_cpu_unum(int cpuid, char *buf, int buflen, int *lenp) +{ + int ret; + char unum[UNUM_NAMLEN]; + + if (&plat_get_cpu_unum) { + if ((ret = plat_get_cpu_unum(cpuid, unum, UNUM_NAMLEN, lenp)) + != 0) + return (ret); + } else { + return (ENOTSUP); + } + + if (*lenp >= buflen) + return (ENAMETOOLONG); + + (void) strncpy(buf, unum, buflen); + + return (0); +} + +/* + * This routine exports the name buffer size. + */ +size_t +cpu_get_name_bufsize() +{ + return (UNUM_NAMLEN); +} + +/* + * Historical function, apparantly not used. + */ +/* ARGSUSED */ +void +cpu_read_paddr(struct async_flt *ecc, short verbose, short ce_err) +{} + +/* + * Historical function only called for SBus errors in debugging. + */ +/*ARGSUSED*/ +void +read_ecc_data(struct async_flt *aflt, short verbose, short ce_err) +{} + +/* + * Clear the AFSR sticky bits. The routine returns a non-zero value if + * any of the AFSR's sticky errors are detected. If a non-null pointer to + * an async fault structure argument is passed in, the captured error state + * (AFSR, AFAR) info will be returned in the structure. + */ +int +clear_errors(ch_async_flt_t *ch_flt) +{ + struct async_flt *aflt = (struct async_flt *)ch_flt; + ch_cpu_errors_t cpu_error_regs; + + get_cpu_error_state(&cpu_error_regs); + + if (ch_flt != NULL) { + aflt->flt_stat = cpu_error_regs.afsr & C_AFSR_MASK; + aflt->flt_addr = cpu_error_regs.afar; + ch_flt->afsr_ext = cpu_error_regs.afsr_ext; + ch_flt->afsr_errs = (cpu_error_regs.afsr & C_AFSR_ALL_ERRS) | + (cpu_error_regs.afsr_ext & C_AFSR_EXT_ALL_ERRS); +#if defined(SERRANO) + ch_flt->afar2 = cpu_error_regs.afar2; +#endif /* SERRANO */ + } + + set_cpu_error_state(&cpu_error_regs); + + return (((cpu_error_regs.afsr & C_AFSR_ALL_ERRS) | + (cpu_error_regs.afsr_ext & C_AFSR_EXT_ALL_ERRS)) != 0); +} + +/* + * Clear any AFSR error bits, and check for persistence. + * + * It would be desirable to also insist that syndrome match. PCI handling + * has already filled flt_synd. For errors trapped by CPU we only fill + * flt_synd when we queue the event, so we do not have a valid flt_synd + * during initial classification (it is valid if we're called as part of + * subsequent low-pil additional classification attempts). We could try + * to determine which syndrome to use: we know we're only called for + * CE/RCE (Jalapeno & Serrano) and CE/EMC (others) so the syndrome to use + * would be esynd/none and esynd/msynd, respectively. If that is + * implemented then what do we do in the case that we do experience an + * error on the same afar but with different syndrome? At the very least + * we should count such occurences. Anyway, for now, we'll leave it as + * it has been for ages. 
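+ *
+ * Were it implemented, the extra test would look roughly like this sketch
+ * (which is not what clear_ecc() below actually does):
+ *
+ *	if (afar_matched && aflt->flt_synd != 0 &&
+ *	    GET_E_SYND(cpu_error_regs.afsr) != aflt->flt_synd)
+ *		! same afar but a different syndrome - at least count it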
+ */ +static int +clear_ecc(struct async_flt *aflt) +{ + ch_cpu_errors_t cpu_error_regs; + + /* + * Snapshot the AFSR and AFAR and clear any errors + */ + get_cpu_error_state(&cpu_error_regs); + set_cpu_error_state(&cpu_error_regs); + + /* + * If any of the same memory access error bits are still on and + * the AFAR matches, return that the error is persistent. + */ + return ((cpu_error_regs.afsr & (C_AFSR_MEMORY & aflt->flt_stat)) != 0 && + cpu_error_regs.afar == aflt->flt_addr); +} + +/* + * Turn off all cpu error detection, normally only used for panics. + */ +void +cpu_disable_errors(void) +{ + xt_all(set_error_enable_tl1, EN_REG_DISABLE, EER_SET_ABSOLUTE); +} + +/* + * Enable errors. + */ +void +cpu_enable_errors(void) +{ + xt_all(set_error_enable_tl1, EN_REG_ENABLE, EER_SET_ABSOLUTE); +} + +/* + * Flush the entire ecache using displacement flush by reading through a + * physical address range twice as large as the Ecache. + */ +void +cpu_flush_ecache(void) +{ + flush_ecache(ecache_flushaddr, cpunodes[CPU->cpu_id].ecache_size, + cpunodes[CPU->cpu_id].ecache_linesize); +} + +/* + * Return CPU E$ set size - E$ size divided by the associativity. + * We use this function in places where the CPU_PRIVATE ptr may not be + * initialized yet. Note that for send_mondo and in the Ecache scrubber, + * we're guaranteed that CPU_PRIVATE is initialized. Also, cpunodes is set + * up before the kernel switches from OBP's to the kernel's trap table, so + * we don't have to worry about cpunodes being unitialized. + */ +int +cpu_ecache_set_size(struct cpu *cp) +{ + if (CPU_PRIVATE(cp)) + return (CPU_PRIVATE_VAL(cp, chpr_ec_set_size)); + + return (cpunodes[cp->cpu_id].ecache_size / cpu_ecache_nway()); +} + +/* + * Flush Ecache line. + * Uses ASI_EC_DIAG for Cheetah+ and Jalapeno. + * Uses normal displacement flush for Cheetah. + */ +static void +cpu_flush_ecache_line(ch_async_flt_t *ch_flt) +{ + struct async_flt *aflt = (struct async_flt *)ch_flt; + int ec_set_size = cpu_ecache_set_size(CPU); + + ecache_flush_line(aflt->flt_addr, ec_set_size); +} + +/* + * Scrub physical address. + * Scrub code is different depending upon whether this a Cheetah+ with 2-way + * Ecache or direct-mapped Ecache. + */ +static void +cpu_scrubphys(struct async_flt *aflt) +{ + int ec_set_size = cpu_ecache_set_size(CPU); + + scrubphys(aflt->flt_addr, ec_set_size); +} + +/* + * Clear physical address. + * Scrub code is different depending upon whether this a Cheetah+ with 2-way + * Ecache or direct-mapped Ecache. + */ +void +cpu_clearphys(struct async_flt *aflt) +{ + int lsize = cpunodes[CPU->cpu_id].ecache_linesize; + int ec_set_size = cpu_ecache_set_size(CPU); + + + clearphys(P2ALIGN(aflt->flt_addr, lsize), ec_set_size, lsize); +} + +#if defined(CPU_IMP_ECACHE_ASSOC) +/* + * Check for a matching valid line in all the sets. + * If found, return set# + 1. Otherwise return 0. 
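+ * (That is, a return value of n means way n - 1 held a valid matching
+ * line; 0 means no way matched.)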
+ */ +static int +cpu_ecache_line_valid(ch_async_flt_t *ch_flt) +{ + struct async_flt *aflt = (struct async_flt *)ch_flt; + int totalsize = cpunodes[CPU->cpu_id].ecache_size; + int ec_set_size = cpu_ecache_set_size(CPU); + ch_ec_data_t *ecp = &ch_flt->flt_diag_data.chd_ec_data[0]; + int nway = cpu_ecache_nway(); + int i; + + for (i = 0; i < nway; i++, ecp++) { + if (!cpu_ectag_line_invalid(totalsize, ecp->ec_tag) && + (aflt->flt_addr & P2ALIGN(C_AFAR_PA, ec_set_size)) == + cpu_ectag_to_pa(ec_set_size, ecp->ec_tag)) + return (i+1); + } + return (0); +} +#endif /* CPU_IMP_ECACHE_ASSOC */ + +/* + * Check whether a line in the given logout info matches the specified + * fault address. If reqval is set then the line must not be Invalid. + * Returns 0 on failure; on success (way + 1) is returned an *level is + * set to 2 for l2$ or 3 for l3$. + */ +static int +cpu_matching_ecache_line(uint64_t faddr, void *data, int reqval, int *level) +{ + ch_diag_data_t *cdp = data; + ch_ec_data_t *ecp; + int totalsize, ec_set_size; + int i, ways; + int match = 0; + int tagvalid; + uint64_t addr, tagpa; + int ispanther = IS_PANTHER(cpunodes[CPU->cpu_id].implementation); + + /* + * Check the l2$ logout data + */ + if (ispanther) { + ecp = &cdp->chd_l2_data[0]; + ec_set_size = PN_L2_SET_SIZE; + ways = PN_L2_NWAYS; + } else { + ecp = &cdp->chd_ec_data[0]; + ec_set_size = cpu_ecache_set_size(CPU); + ways = cpu_ecache_nway(); + totalsize = cpunodes[CPU->cpu_id].ecache_size; + } + /* remove low order PA bits from fault address not used in PA tag */ + addr = faddr & P2ALIGN(C_AFAR_PA, ec_set_size); + for (i = 0; i < ways; i++, ecp++) { + if (ispanther) { + tagpa = PN_L2TAG_TO_PA(ecp->ec_tag); + tagvalid = !PN_L2_LINE_INVALID(ecp->ec_tag); + } else { + tagpa = cpu_ectag_to_pa(ec_set_size, ecp->ec_tag); + tagvalid = !cpu_ectag_line_invalid(totalsize, + ecp->ec_tag); + } + if (tagpa == addr && (!reqval || tagvalid)) { + match = i + 1; + *level = 2; + break; + } + } + + if (match || !ispanther) + return (match); + + /* For Panther we also check the l3$ */ + ecp = &cdp->chd_ec_data[0]; + ec_set_size = PN_L3_SET_SIZE; + ways = PN_L3_NWAYS; + addr = faddr & P2ALIGN(C_AFAR_PA, ec_set_size); + + for (i = 0; i < ways; i++, ecp++) { + if (PN_L3TAG_TO_PA(ecp->ec_tag) == addr && (!reqval || + !PN_L3_LINE_INVALID(ecp->ec_tag))) { + match = i + 1; + *level = 3; + break; + } + } + + return (match); +} + +#if defined(CPU_IMP_L1_CACHE_PARITY) +/* + * Record information related to the source of an Dcache Parity Error. + */ +static void +cpu_dcache_parity_info(ch_async_flt_t *ch_flt) +{ + int dc_set_size = dcache_size / CH_DCACHE_NWAY; + int index; + + /* + * Since instruction decode cannot be done at high PIL + * just examine the entire Dcache to locate the error. + */ + if (ch_flt->parity_data.dpe.cpl_lcnt == 0) { + ch_flt->parity_data.dpe.cpl_way = -1; + ch_flt->parity_data.dpe.cpl_off = -1; + } + for (index = 0; index < dc_set_size; index += dcache_linesize) + cpu_dcache_parity_check(ch_flt, index); +} + +/* + * Check all ways of the Dcache at a specified index for good parity. + */ +static void +cpu_dcache_parity_check(ch_async_flt_t *ch_flt, int index) +{ + int dc_set_size = dcache_size / CH_DCACHE_NWAY; + uint64_t parity_bits, pbits, data_word; + static int parity_bits_popc[] = { 0, 1, 1, 0 }; + int way, word, data_byte; + ch_dc_data_t *dcp = &ch_flt->parity_data.dpe.cpl_dc[0]; + ch_dc_data_t tmp_dcp; + + for (way = 0; way < CH_DCACHE_NWAY; way++, dcp++) { + /* + * Perform diagnostic read. 
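+ * The diagnostic address passed to get_dcache_dtag() encodes both the
+ * line and the way: 'index' selects the line and 'way * dc_set_size'
+ * selects the way.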
+ */ + get_dcache_dtag(index + way * dc_set_size, + (uint64_t *)&tmp_dcp); + + /* + * Check tag for even parity. + * Sum of 1 bits (including parity bit) should be even. + */ + if (popc64(tmp_dcp.dc_tag & CHP_DCTAG_PARMASK) & 1) { + /* + * If this is the first error log detailed information + * about it and check the snoop tag. Otherwise just + * record the fact that we found another error. + */ + if (ch_flt->parity_data.dpe.cpl_lcnt == 0) { + ch_flt->parity_data.dpe.cpl_way = way; + ch_flt->parity_data.dpe.cpl_cache = + CPU_DC_PARITY; + ch_flt->parity_data.dpe.cpl_tag |= CHP_DC_TAG; + + if (popc64(tmp_dcp.dc_sntag & + CHP_DCSNTAG_PARMASK) & 1) { + ch_flt->parity_data.dpe.cpl_tag |= + CHP_DC_SNTAG; + ch_flt->parity_data.dpe.cpl_lcnt++; + } + + bcopy(&tmp_dcp, dcp, sizeof (ch_dc_data_t)); + } + + ch_flt->parity_data.dpe.cpl_lcnt++; + } + + if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) { + /* + * Panther has more parity bits than the other + * processors for covering dcache data and so each + * byte of data in each word has its own parity bit. + */ + parity_bits = tmp_dcp.dc_pn_data_parity; + for (word = 0; word < 4; word++) { + data_word = tmp_dcp.dc_data[word]; + pbits = parity_bits & PN_DC_DATA_PARITY_MASK; + for (data_byte = 0; data_byte < 8; + data_byte++) { + if (((popc64(data_word & + PN_DC_DATA_PARITY_MASK)) & 1) ^ + (pbits & 1)) { + cpu_record_dc_data_parity( + ch_flt, dcp, &tmp_dcp, way, + word); + } + pbits >>= 1; + data_word >>= 8; + } + parity_bits >>= 8; + } + } else { + /* + * Check data array for even parity. + * The 8 parity bits are grouped into 4 pairs each + * of which covers a 64-bit word. The endianness is + * reversed -- the low-order parity bits cover the + * high-order data words. + */ + parity_bits = tmp_dcp.dc_utag >> 8; + for (word = 0; word < 4; word++) { + pbits = (parity_bits >> (6 - word * 2)) & 3; + if ((popc64(tmp_dcp.dc_data[word]) + + parity_bits_popc[pbits]) & 1) { + cpu_record_dc_data_parity(ch_flt, dcp, + &tmp_dcp, way, word); + } + } + } + } +} + +static void +cpu_record_dc_data_parity(ch_async_flt_t *ch_flt, + ch_dc_data_t *dest_dcp, ch_dc_data_t *src_dcp, int way, int word) +{ + /* + * If this is the first error log detailed information about it. + * Otherwise just record the fact that we found another error. + */ + if (ch_flt->parity_data.dpe.cpl_lcnt == 0) { + ch_flt->parity_data.dpe.cpl_way = way; + ch_flt->parity_data.dpe.cpl_cache = CPU_DC_PARITY; + ch_flt->parity_data.dpe.cpl_off = word * 8; + bcopy(src_dcp, dest_dcp, sizeof (ch_dc_data_t)); + } + ch_flt->parity_data.dpe.cpl_lcnt++; +} + +/* + * Record information related to the source of an Icache Parity Error. + * + * Called with the Icache disabled so any diagnostic accesses are safe. + */ +static void +cpu_icache_parity_info(ch_async_flt_t *ch_flt) +{ + int ic_set_size; + int ic_linesize; + int index; + + if (CPU_PRIVATE(CPU)) { + ic_set_size = CPU_PRIVATE_VAL(CPU, chpr_icache_size) / + CH_ICACHE_NWAY; + ic_linesize = CPU_PRIVATE_VAL(CPU, chpr_icache_linesize); + } else { + ic_set_size = icache_size / CH_ICACHE_NWAY; + ic_linesize = icache_linesize; + } + + ch_flt->parity_data.ipe.cpl_way = -1; + ch_flt->parity_data.ipe.cpl_off = -1; + + for (index = 0; index < ic_set_size; index += ic_linesize) + cpu_icache_parity_check(ch_flt, index); +} + +/* + * Check all ways of the Icache at a specified index for good parity. 
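+ *
+ * The I$ diagnostic ASI expects its address argument in ASI format,
+ * which is why a '2 *' is applied to the index passed to
+ * get_icache_dtag() below (the D$ equivalent takes the index directly).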
+ */ +static void +cpu_icache_parity_check(ch_async_flt_t *ch_flt, int index) +{ + uint64_t parmask, pn_inst_parity; + int ic_set_size; + int ic_linesize; + int flt_index, way, instr, num_instr; + struct async_flt *aflt = (struct async_flt *)ch_flt; + ch_ic_data_t *icp = &ch_flt->parity_data.ipe.cpl_ic[0]; + ch_ic_data_t tmp_icp; + + if (CPU_PRIVATE(CPU)) { + ic_set_size = CPU_PRIVATE_VAL(CPU, chpr_icache_size) / + CH_ICACHE_NWAY; + ic_linesize = CPU_PRIVATE_VAL(CPU, chpr_icache_linesize); + } else { + ic_set_size = icache_size / CH_ICACHE_NWAY; + ic_linesize = icache_linesize; + } + + /* + * Panther has twice as many instructions per icache line and the + * instruction parity bit is in a different location. + */ + if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) { + num_instr = PN_IC_DATA_REG_SIZE / sizeof (uint64_t); + pn_inst_parity = PN_ICDATA_PARITY_BIT_MASK; + } else { + num_instr = CH_IC_DATA_REG_SIZE / sizeof (uint64_t); + pn_inst_parity = 0; + } + + /* + * Index at which we expect to find the parity error. + */ + flt_index = P2ALIGN(aflt->flt_addr % ic_set_size, ic_linesize); + + for (way = 0; way < CH_ICACHE_NWAY; way++, icp++) { + /* + * Diagnostic reads expect address argument in ASI format. + */ + get_icache_dtag(2 * (index + way * ic_set_size), + (uint64_t *)&tmp_icp); + + /* + * If this is the index in which we expect to find the + * error log detailed information about each of the ways. + * This information will be displayed later if we can't + * determine the exact way in which the error is located. + */ + if (flt_index == index) + bcopy(&tmp_icp, icp, sizeof (ch_ic_data_t)); + + /* + * Check tag for even parity. + * Sum of 1 bits (including parity bit) should be even. + */ + if (popc64(tmp_icp.ic_patag & CHP_ICPATAG_PARMASK) & 1) { + /* + * If this way is the one in which we expected + * to find the error record the way and check the + * snoop tag. Otherwise just record the fact we + * found another error. + */ + if (flt_index == index) { + ch_flt->parity_data.ipe.cpl_way = way; + ch_flt->parity_data.ipe.cpl_tag |= CHP_IC_TAG; + + if (popc64(tmp_icp.ic_sntag & + CHP_ICSNTAG_PARMASK) & 1) { + ch_flt->parity_data.ipe.cpl_tag |= + CHP_IC_SNTAG; + ch_flt->parity_data.ipe.cpl_lcnt++; + } + + } + ch_flt->parity_data.ipe.cpl_lcnt++; + continue; + } + + /* + * Check instruction data for even parity. + * Bits participating in parity differ for PC-relative + * versus non-PC-relative instructions. + */ + for (instr = 0; instr < num_instr; instr++) { + parmask = (tmp_icp.ic_data[instr] & + CH_ICDATA_PRED_ISPCREL) ? + (CHP_ICDATA_PCREL_PARMASK | pn_inst_parity) : + (CHP_ICDATA_NPCREL_PARMASK | pn_inst_parity); + if (popc64(tmp_icp.ic_data[instr] & parmask) & 1) { + /* + * If this way is the one in which we expected + * to find the error record the way and offset. + * Otherwise just log the fact we found another + * error. + */ + if (flt_index == index) { + ch_flt->parity_data.ipe.cpl_way = way; + ch_flt->parity_data.ipe.cpl_off = + instr * 4; + } + ch_flt->parity_data.ipe.cpl_lcnt++; + continue; + } + } + } +} + +/* + * Record information related to the source of an Pcache Parity Error. + */ +static void +cpu_pcache_parity_info(ch_async_flt_t *ch_flt) +{ + int pc_set_size = CH_PCACHE_SIZE / CH_PCACHE_NWAY; + int index; + + /* + * Since instruction decode cannot be done at high PIL just + * examine the entire Pcache to check for any parity errors. 
+ */ + if (ch_flt->parity_data.dpe.cpl_lcnt == 0) { + ch_flt->parity_data.dpe.cpl_way = -1; + ch_flt->parity_data.dpe.cpl_off = -1; + } + for (index = 0; index < pc_set_size; index += CH_PCACHE_LSIZE) + cpu_pcache_parity_check(ch_flt, index); +} + +/* + * Check all ways of the Pcache at a specified index for good parity. + */ +static void +cpu_pcache_parity_check(ch_async_flt_t *ch_flt, int index) +{ + int pc_set_size = CH_PCACHE_SIZE / CH_PCACHE_NWAY; + int pc_data_words = CH_PC_DATA_REG_SIZE / sizeof (uint64_t); + int way, word, pbit, parity_bits; + ch_pc_data_t *pcp = &ch_flt->parity_data.dpe.cpl_pc[0]; + ch_pc_data_t tmp_pcp; + + for (way = 0; way < CH_PCACHE_NWAY; way++, pcp++) { + /* + * Perform diagnostic read. + */ + get_pcache_dtag(index + way * pc_set_size, + (uint64_t *)&tmp_pcp); + /* + * Check data array for odd parity. There are 8 parity + * bits (bits 57:50 of ASI_PCACHE_STATUS_DATA) and each + * of those bits covers exactly 8 bytes of the data + * array: + * + * parity bit P$ data bytes covered + * ---------- --------------------- + * 50 63:56 + * 51 55:48 + * 52 47:40 + * 53 39:32 + * 54 31:24 + * 55 23:16 + * 56 15:8 + * 57 7:0 + */ + parity_bits = PN_PC_PARITY_BITS(tmp_pcp.pc_status); + for (word = 0; word < pc_data_words; word++) { + pbit = (parity_bits >> (pc_data_words - word - 1)) & 1; + if ((popc64(tmp_pcp.pc_data[word]) & 1) ^ pbit) { + /* + * If this is the first error log detailed + * information about it. Otherwise just record + * the fact that we found another error. + */ + if (ch_flt->parity_data.dpe.cpl_lcnt == 0) { + ch_flt->parity_data.dpe.cpl_way = way; + ch_flt->parity_data.dpe.cpl_cache = + CPU_PC_PARITY; + ch_flt->parity_data.dpe.cpl_off = + word * sizeof (uint64_t); + bcopy(&tmp_pcp, pcp, + sizeof (ch_pc_data_t)); + } + ch_flt->parity_data.dpe.cpl_lcnt++; + } + } + } +} + + +/* + * Add L1 Data cache data to the ereport payload. + */ +static void +cpu_payload_add_dcache(struct async_flt *aflt, nvlist_t *nvl) +{ + ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt; + ch_dc_data_t *dcp; + ch_dc_data_t dcdata[CH_DCACHE_NWAY]; + uint_t nelem; + int i, ways_to_check, ways_logged = 0; + + /* + * If this is an D$ fault then there may be multiple + * ways captured in the ch_parity_log_t structure. + * Otherwise, there will be at most one way captured + * in the ch_diag_data_t struct. + * Check each way to see if it should be encoded. + */ + if (ch_flt->flt_type == CPU_DC_PARITY) + ways_to_check = CH_DCACHE_NWAY; + else + ways_to_check = 1; + for (i = 0; i < ways_to_check; i++) { + if (ch_flt->flt_type == CPU_DC_PARITY) + dcp = &ch_flt->parity_data.dpe.cpl_dc[i]; + else + dcp = &ch_flt->flt_diag_data.chd_dc_data; + if (dcp->dc_logflag == DC_LOGFLAG_MAGIC) { + bcopy(dcp, &dcdata[ways_logged], + sizeof (ch_dc_data_t)); + ways_logged++; + } + } + + /* + * Add the dcache data to the payload. + */ + fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L1D_WAYS, + DATA_TYPE_UINT8, (uint8_t)ways_logged, NULL); + if (ways_logged != 0) { + nelem = sizeof (ch_dc_data_t) / sizeof (uint64_t) * ways_logged; + fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L1D_DATA, + DATA_TYPE_UINT64_ARRAY, nelem, (uint64_t *)dcdata, NULL); + } +} + +/* + * Add L1 Instruction cache data to the ereport payload. 
+ */ +static void +cpu_payload_add_icache(struct async_flt *aflt, nvlist_t *nvl) +{ + ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt; + ch_ic_data_t *icp; + ch_ic_data_t icdata[CH_ICACHE_NWAY]; + uint_t nelem; + int i, ways_to_check, ways_logged = 0; + + /* + * If this is an I$ fault then there may be multiple + * ways captured in the ch_parity_log_t structure. + * Otherwise, there will be at most one way captured + * in the ch_diag_data_t struct. + * Check each way to see if it should be encoded. + */ + if (ch_flt->flt_type == CPU_IC_PARITY) + ways_to_check = CH_ICACHE_NWAY; + else + ways_to_check = 1; + for (i = 0; i < ways_to_check; i++) { + if (ch_flt->flt_type == CPU_IC_PARITY) + icp = &ch_flt->parity_data.ipe.cpl_ic[i]; + else + icp = &ch_flt->flt_diag_data.chd_ic_data; + if (icp->ic_logflag == IC_LOGFLAG_MAGIC) { + bcopy(icp, &icdata[ways_logged], + sizeof (ch_ic_data_t)); + ways_logged++; + } + } + + /* + * Add the icache data to the payload. + */ + fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L1I_WAYS, + DATA_TYPE_UINT8, (uint8_t)ways_logged, NULL); + if (ways_logged != 0) { + nelem = sizeof (ch_ic_data_t) / sizeof (uint64_t) * ways_logged; + fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L1I_DATA, + DATA_TYPE_UINT64_ARRAY, nelem, (uint64_t *)icdata, NULL); + } +} + +#endif /* CPU_IMP_L1_CACHE_PARITY */ + +/* + * Add ecache data to payload. + */ +static void +cpu_payload_add_ecache(struct async_flt *aflt, nvlist_t *nvl) +{ + ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt; + ch_ec_data_t *ecp; + ch_ec_data_t ecdata[CHD_EC_DATA_SETS]; + uint_t nelem; + int i, ways_logged = 0; + + /* + * Check each way to see if it should be encoded + * and concatinate it into a temporary buffer. + */ + for (i = 0; i < CHD_EC_DATA_SETS; i++) { + ecp = &ch_flt->flt_diag_data.chd_ec_data[i]; + if (ecp->ec_logflag == EC_LOGFLAG_MAGIC) { + bcopy(ecp, &ecdata[ways_logged], + sizeof (ch_ec_data_t)); + ways_logged++; + } + } + + /* + * Panther CPUs have an additional level of cache and so + * what we just collected was the L3 (ecache) and not the + * L2 cache. + */ + if (IS_PANTHER(cpunodes[aflt->flt_inst].implementation)) { + /* + * Add the L3 (ecache) data to the payload. + */ + fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L3_WAYS, + DATA_TYPE_UINT8, (uint8_t)ways_logged, NULL); + if (ways_logged != 0) { + nelem = sizeof (ch_ec_data_t) / + sizeof (uint64_t) * ways_logged; + fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L3_DATA, + DATA_TYPE_UINT64_ARRAY, nelem, + (uint64_t *)ecdata, NULL); + } + + /* + * Now collect the L2 cache. + */ + ways_logged = 0; + for (i = 0; i < PN_L2_NWAYS; i++) { + ecp = &ch_flt->flt_diag_data.chd_l2_data[i]; + if (ecp->ec_logflag == EC_LOGFLAG_MAGIC) { + bcopy(ecp, &ecdata[ways_logged], + sizeof (ch_ec_data_t)); + ways_logged++; + } + } + } + + /* + * Add the L2 cache data to the payload. + */ + fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L2_WAYS, + DATA_TYPE_UINT8, (uint8_t)ways_logged, NULL); + if (ways_logged != 0) { + nelem = sizeof (ch_ec_data_t) / + sizeof (uint64_t) * ways_logged; + fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L2_DATA, + DATA_TYPE_UINT64_ARRAY, nelem, (uint64_t *)ecdata, NULL); + } +} + +/* + * Encode the data saved in the ch_async_flt_t struct into + * the FM ereport payload. 
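+ *
+ * Each FM_EREPORT_PAYLOAD_FLAG_* bit set in aflt->flt_payload enables one
+ * named payload member, following the pattern
+ *
+ *	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_AFAR)
+ *		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_AFAR,
+ *		    DATA_TYPE_UINT64, aflt->flt_addr, NULL);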
+ */ +static void +cpu_payload_add_aflt(struct async_flt *aflt, nvlist_t *payload, + nvlist_t *resource, int *afar_status, int *synd_status) +{ + ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt; + *synd_status = AFLT_STAT_INVALID; + *afar_status = AFLT_STAT_INVALID; + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_AFSR) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_AFSR, + DATA_TYPE_UINT64, aflt->flt_stat, NULL); + } + + if ((aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_AFSR_EXT) && + IS_PANTHER(cpunodes[aflt->flt_inst].implementation)) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_AFSR_EXT, + DATA_TYPE_UINT64, ch_flt->afsr_ext, NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_AFAR_STATUS) { + *afar_status = afsr_to_afar_status(ch_flt->afsr_errs, + ch_flt->flt_bit); + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_AFAR_STATUS, + DATA_TYPE_UINT8, (uint8_t)*afar_status, NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_AFAR) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_AFAR, + DATA_TYPE_UINT64, aflt->flt_addr, NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_PC) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_PC, + DATA_TYPE_UINT64, (uint64_t)aflt->flt_pc, NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_TL) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_TL, + DATA_TYPE_UINT8, (uint8_t)aflt->flt_tl, NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_TT) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_TT, + DATA_TYPE_UINT8, flt_to_trap_type(aflt), NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_PRIV) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_PRIV, + DATA_TYPE_BOOLEAN_VALUE, + (aflt->flt_priv ? B_TRUE : B_FALSE), NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_ME) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_ME, + DATA_TYPE_BOOLEAN_VALUE, + (aflt->flt_stat & C_AFSR_ME) ? 
B_TRUE : B_FALSE, NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_SYND_STATUS) { + *synd_status = afsr_to_synd_status(aflt->flt_inst, + ch_flt->afsr_errs, ch_flt->flt_bit); + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_SYND_STATUS, + DATA_TYPE_UINT8, (uint8_t)*synd_status, NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_SYND) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_SYND, + DATA_TYPE_UINT16, (uint16_t)aflt->flt_synd, NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_ERR_TYPE) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_ERR_TYPE, + DATA_TYPE_STRING, flt_to_error_type(aflt), NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_ERR_DISP) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_ERR_DISP, + DATA_TYPE_UINT64, aflt->flt_disp, NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAGS_L2) + cpu_payload_add_ecache(aflt, payload); + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_COPYFUNCTION) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_COPYFUNCTION, + DATA_TYPE_UINT8, (uint8_t)aflt->flt_status & 0xff, NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_HOWDETECTED) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_HOWDETECTED, + DATA_TYPE_UINT8, (uint8_t)(aflt->flt_status >> 8), NULL); + } + + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_INSTRBLOCK) { + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_INSTRBLOCK, + DATA_TYPE_UINT32_ARRAY, 16, + (uint32_t *)&ch_flt->flt_fpdata, NULL); + } + +#if defined(CPU_IMP_L1_CACHE_PARITY) + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAGS_L1D) + cpu_payload_add_dcache(aflt, payload); + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAGS_L1I) + cpu_payload_add_icache(aflt, payload); +#endif /* CPU_IMP_L1_CACHE_PARITY */ + +#if defined(CHEETAH_PLUS) + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAGS_L1P) + cpu_payload_add_pcache(aflt, payload); + if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAGS_TLB) + cpu_payload_add_tlb(aflt, payload); +#endif /* CHEETAH_PLUS */ + /* + * Create the FMRI that goes into the payload + * and contains the unum info if necessary. + */ + if ((aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_RESOURCE) && + (*afar_status == AFLT_STAT_VALID)) { + char unum[UNUM_NAMLEN]; + int len; + + if (cpu_get_mem_unum_aflt(*synd_status, aflt, unum, + UNUM_NAMLEN, &len) == 0) { + fm_fmri_mem_set(resource, FM_MEM_SCHEME_VERSION, + NULL, unum, NULL); + fm_payload_set(payload, + FM_EREPORT_PAYLOAD_NAME_RESOURCE, + DATA_TYPE_NVLIST, resource, NULL); + } + } +} + +/* + * Initialize the way info if necessary. + */ +void +cpu_ereport_init(struct async_flt *aflt) +{ + ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt; + ch_ec_data_t *ecp = &ch_flt->flt_diag_data.chd_ec_data[0]; + ch_ec_data_t *l2p = &ch_flt->flt_diag_data.chd_l2_data[0]; + int i; + + /* + * Initialize the info in the CPU logout structure. + * The I$/D$ way information is not initialized here + * since it is captured in the logout assembly code. + */ + for (i = 0; i < CHD_EC_DATA_SETS; i++) + (ecp + i)->ec_way = i; + + for (i = 0; i < PN_L2_NWAYS; i++) + (l2p + i)->ec_way = i; +} + +/* + * Returns whether fault address is valid for this error bit and + * whether the address is "in memory" (i.e. pf_is_memory returns 1). 
+ */ +int +cpu_flt_in_memory(ch_async_flt_t *ch_flt, uint64_t t_afsr_bit) +{ + struct async_flt *aflt = (struct async_flt *)ch_flt; + + return ((aflt->flt_stat & C_AFSR_MEMORY) && + afsr_to_afar_status(ch_flt->afsr_errs, t_afsr_bit) == + AFLT_STAT_VALID && + pf_is_memory(aflt->flt_addr >> MMU_PAGESHIFT)); +} + +static void +cpu_log_diag_info(ch_async_flt_t *ch_flt) +{ + struct async_flt *aflt = (struct async_flt *)ch_flt; + ch_dc_data_t *dcp = &ch_flt->flt_diag_data.chd_dc_data; + ch_ic_data_t *icp = &ch_flt->flt_diag_data.chd_ic_data; + ch_ec_data_t *ecp = &ch_flt->flt_diag_data.chd_ec_data[0]; +#if defined(CPU_IMP_ECACHE_ASSOC) + int i, nway; +#endif /* CPU_IMP_ECACHE_ASSOC */ + + /* + * Check if the CPU log out captured was valid. + */ + if (ch_flt->flt_diag_data.chd_afar == LOGOUT_INVALID || + ch_flt->flt_data_incomplete) + return; + +#if defined(CPU_IMP_ECACHE_ASSOC) + nway = cpu_ecache_nway(); + i = cpu_ecache_line_valid(ch_flt); + if (i == 0 || i > nway) { + for (i = 0; i < nway; i++) + ecp[i].ec_logflag = EC_LOGFLAG_MAGIC; + } else + ecp[i - 1].ec_logflag = EC_LOGFLAG_MAGIC; +#else /* CPU_IMP_ECACHE_ASSOC */ + ecp->ec_logflag = EC_LOGFLAG_MAGIC; +#endif /* CPU_IMP_ECACHE_ASSOC */ + +#if defined(CHEETAH_PLUS) + pn_cpu_log_diag_l2_info(ch_flt); +#endif /* CHEETAH_PLUS */ + + if (CH_DCTAG_MATCH(dcp->dc_tag, aflt->flt_addr)) { + dcp->dc_way = CH_DCIDX_TO_WAY(dcp->dc_idx); + dcp->dc_logflag = DC_LOGFLAG_MAGIC; + } + + if (CH_ICTAG_MATCH(icp, aflt->flt_addr)) { + if (IS_PANTHER(cpunodes[aflt->flt_inst].implementation)) + icp->ic_way = PN_ICIDX_TO_WAY(icp->ic_idx); + else + icp->ic_way = CH_ICIDX_TO_WAY(icp->ic_idx); + icp->ic_logflag = IC_LOGFLAG_MAGIC; + } +} + +/* + * Cheetah ECC calculation. + * + * We only need to do the calculation on the data bits and can ignore check + * bit and Mtag bit terms in the calculation. + */ +static uint64_t ch_ecc_table[9][2] = { + /* + * low order 64-bits high-order 64-bits + */ + { 0x46bffffeccd1177f, 0x488800022100014c }, + { 0x42fccc81331ff77f, 0x14424f1010249184 }, + { 0x8898827c222f1ffe, 0x22c1222808184aaf }, + { 0xf7632203e131ccf1, 0xe1241121848292b8 }, + { 0x7f5511421b113809, 0x901c88d84288aafe }, + { 0x1d49412184882487, 0x8f338c87c044c6ef }, + { 0xf552181014448344, 0x7ff8f4443e411911 }, + { 0x2189240808f24228, 0xfeeff8cc81333f42 }, + { 0x3280008440001112, 0xfee88b337ffffd62 }, +}; + +/* + * 64-bit population count, use well-known popcnt trick. + * We could use the UltraSPARC V9 POPC instruction, but some + * CPUs including Cheetahplus and Jaguar do not support that + * instruction. + */ +int +popc64(uint64_t val) +{ + int cnt; + + for (cnt = 0; val != 0; val &= val - 1) + cnt++; + return (cnt); +} + +/* + * Generate the 9 ECC bits for the 128-bit chunk based on the table above. + * Note that xor'ing an odd number of 1 bits == 1 and xor'ing an even number + * of 1 bits == 0, so we can just use the least significant bit of the popcnt + * instead of doing all the xor's. + */ +uint32_t +us3_gen_ecc(uint64_t data_low, uint64_t data_high) +{ + int bitno, s; + int synd = 0; + + for (bitno = 0; bitno < 9; bitno++) { + s = (popc64(data_low & ch_ecc_table[bitno][0]) + + popc64(data_high & ch_ecc_table[bitno][1])) & 1; + synd |= (s << bitno); + } + return (synd); + +} + +/* + * Queue one event based on ecc_type_to_info entry. If the event has an AFT1 + * tag associated with it or is a fatal event (aflt_panic set), it is sent to + * the UE event queue. Otherwise it is dispatched to the CE event queue. 
+ */ +static void +cpu_queue_one_event(ch_async_flt_t *ch_flt, char *reason, + ecc_type_to_info_t *eccp, ch_diag_data_t *cdp) +{ + struct async_flt *aflt = (struct async_flt *)ch_flt; + + if (reason && + strlen(reason) + strlen(eccp->ec_reason) < MAX_REASON_STRING) { + (void) strcat(reason, eccp->ec_reason); + } + + ch_flt->flt_bit = eccp->ec_afsr_bit; + ch_flt->flt_type = eccp->ec_flt_type; + if (cdp != NULL && cdp->chd_afar != LOGOUT_INVALID) + ch_flt->flt_diag_data = *cdp; + else + ch_flt->flt_diag_data.chd_afar = LOGOUT_INVALID; + aflt->flt_in_memory = cpu_flt_in_memory(ch_flt, ch_flt->flt_bit); + + if (ch_flt->flt_bit & C_AFSR_MSYND_ERRS) + aflt->flt_synd = GET_M_SYND(aflt->flt_stat); + else if (ch_flt->flt_bit & (C_AFSR_ESYND_ERRS | C_AFSR_EXT_ESYND_ERRS)) + aflt->flt_synd = GET_E_SYND(aflt->flt_stat); + else + aflt->flt_synd = 0; + + aflt->flt_payload = eccp->ec_err_payload; + + if (aflt->flt_panic || (eccp->ec_afsr_bit & + (C_AFSR_LEVEL1 | C_AFSR_EXT_LEVEL1))) + cpu_errorq_dispatch(eccp->ec_err_class, + (void *)ch_flt, sizeof (ch_async_flt_t), ue_queue, + aflt->flt_panic); + else + cpu_errorq_dispatch(eccp->ec_err_class, + (void *)ch_flt, sizeof (ch_async_flt_t), ce_queue, + aflt->flt_panic); +} + +/* + * Queue events on async event queue one event per error bit. First we + * queue the events that we "expect" for the given trap, then we queue events + * that we may not expect. Return number of events queued. + */ +int +cpu_queue_events(ch_async_flt_t *ch_flt, char *reason, uint64_t t_afsr_errs, + ch_cpu_logout_t *clop) +{ + struct async_flt *aflt = (struct async_flt *)ch_flt; + ecc_type_to_info_t *eccp; + int nevents = 0; + uint64_t primary_afar = aflt->flt_addr, primary_afsr = aflt->flt_stat; +#if defined(CHEETAH_PLUS) + uint64_t orig_t_afsr_errs; +#endif + uint64_t primary_afsr_ext = ch_flt->afsr_ext; + uint64_t primary_afsr_errs = ch_flt->afsr_errs; + ch_diag_data_t *cdp = NULL; + + t_afsr_errs &= ((C_AFSR_ALL_ERRS & ~C_AFSR_ME) | C_AFSR_EXT_ALL_ERRS); + +#if defined(CHEETAH_PLUS) + orig_t_afsr_errs = t_afsr_errs; + + /* + * For Cheetah+, log the shadow AFSR/AFAR bits first. + */ + if (clop != NULL) { + /* + * Set the AFSR and AFAR fields to the shadow registers. The + * flt_addr and flt_stat fields will be reset to the primaries + * below, but the sdw_addr and sdw_stat will stay as the + * secondaries. + */ + cdp = &clop->clo_sdw_data; + aflt->flt_addr = ch_flt->flt_sdw_afar = cdp->chd_afar; + aflt->flt_stat = ch_flt->flt_sdw_afsr = cdp->chd_afsr; + ch_flt->afsr_ext = ch_flt->flt_sdw_afsr_ext = cdp->chd_afsr_ext; + ch_flt->afsr_errs = (cdp->chd_afsr_ext & C_AFSR_EXT_ALL_ERRS) | + (cdp->chd_afsr & C_AFSR_ALL_ERRS); + + /* + * If the primary and shadow AFSR differ, tag the shadow as + * the first fault. + */ + if ((primary_afar != cdp->chd_afar) || + (primary_afsr_errs != ch_flt->afsr_errs)) { + aflt->flt_stat |= (1ull << C_AFSR_FIRSTFLT_SHIFT); + } + + /* + * Check AFSR bits as well as AFSR_EXT bits in order of + * the AFAR overwrite priority. Our stored AFSR_EXT value + * is expected to be zero for those CPUs which do not have + * an AFSR_EXT register. 
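+ * Each bit that matches both the logged errors and the trap's expected
+ * flt_status flags is queued as its own event and then cleared from
+ * t_afsr_errs so that it cannot be reported a second time below.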
+ */ + for (eccp = ecc_type_to_info; eccp->ec_desc != NULL; eccp++) { + if ((eccp->ec_afsr_bit & + (ch_flt->afsr_errs & t_afsr_errs)) && + ((eccp->ec_flags & aflt->flt_status) != 0)) { + cpu_queue_one_event(ch_flt, reason, eccp, cdp); + cdp = NULL; + t_afsr_errs &= ~eccp->ec_afsr_bit; + nevents++; + } + } + + /* + * If the ME bit is on in the primary AFSR turn all the + * error bits on again that may set the ME bit to make + * sure we see the ME AFSR error logs. + */ + if ((primary_afsr & C_AFSR_ME) != 0) + t_afsr_errs = (orig_t_afsr_errs & C_AFSR_ALL_ME_ERRS); + } +#endif /* CHEETAH_PLUS */ + + if (clop != NULL) + cdp = &clop->clo_data; + + /* + * Queue expected errors, error bit and fault type must match + * in the ecc_type_to_info table. + */ + for (eccp = ecc_type_to_info; t_afsr_errs != 0 && eccp->ec_desc != NULL; + eccp++) { + if ((eccp->ec_afsr_bit & t_afsr_errs) != 0 && + (eccp->ec_flags & aflt->flt_status) != 0) { +#if defined(SERRANO) + /* + * For FRC/FRU errors on Serrano the afar2 captures + * the address and the associated data is + * in the shadow logout area. + */ + if (eccp->ec_afsr_bit & (C_AFSR_FRC | C_AFSR_FRU)) { + if (clop != NULL) + cdp = &clop->clo_sdw_data; + aflt->flt_addr = ch_flt->afar2; + } else { + if (clop != NULL) + cdp = &clop->clo_data; + aflt->flt_addr = primary_afar; + } +#else /* SERRANO */ + aflt->flt_addr = primary_afar; +#endif /* SERRANO */ + aflt->flt_stat = primary_afsr; + ch_flt->afsr_ext = primary_afsr_ext; + ch_flt->afsr_errs = primary_afsr_errs; + cpu_queue_one_event(ch_flt, reason, eccp, cdp); + cdp = NULL; + t_afsr_errs &= ~eccp->ec_afsr_bit; + nevents++; + } + } + + /* + * Queue unexpected errors, error bit only match. + */ + for (eccp = ecc_type_to_info; t_afsr_errs != 0 && eccp->ec_desc != NULL; + eccp++) { + if (eccp->ec_afsr_bit & t_afsr_errs) { +#if defined(SERRANO) + /* + * For FRC/FRU errors on Serrano the afar2 captures + * the address and the associated data is + * in the shadow logout area. + */ + if (eccp->ec_afsr_bit & (C_AFSR_FRC | C_AFSR_FRU)) { + if (clop != NULL) + cdp = &clop->clo_sdw_data; + aflt->flt_addr = ch_flt->afar2; + } else { + if (clop != NULL) + cdp = &clop->clo_data; + aflt->flt_addr = primary_afar; + } +#else /* SERRANO */ + aflt->flt_addr = primary_afar; +#endif /* SERRANO */ + aflt->flt_stat = primary_afsr; + ch_flt->afsr_ext = primary_afsr_ext; + ch_flt->afsr_errs = primary_afsr_errs; + cpu_queue_one_event(ch_flt, reason, eccp, cdp); + cdp = NULL; + t_afsr_errs &= ~eccp->ec_afsr_bit; + nevents++; + } + } + return (nevents); +} + +/* + * Return trap type number. + */ +uint8_t +flt_to_trap_type(struct async_flt *aflt) +{ + if (aflt->flt_status & ECC_I_TRAP) + return (TRAP_TYPE_ECC_I); + if (aflt->flt_status & ECC_D_TRAP) + return (TRAP_TYPE_ECC_D); + if (aflt->flt_status & ECC_F_TRAP) + return (TRAP_TYPE_ECC_F); + if (aflt->flt_status & ECC_C_TRAP) + return (TRAP_TYPE_ECC_C); + if (aflt->flt_status & ECC_DP_TRAP) + return (TRAP_TYPE_ECC_DP); + if (aflt->flt_status & ECC_IP_TRAP) + return (TRAP_TYPE_ECC_IP); + if (aflt->flt_status & ECC_ITLB_TRAP) + return (TRAP_TYPE_ECC_ITLB); + if (aflt->flt_status & ECC_DTLB_TRAP) + return (TRAP_TYPE_ECC_DTLB); + return (TRAP_TYPE_UNKNOWN); +} + +/* + * Decide an error type based on detector and leaky/partner tests. + * The following array is used for quick translation - it must + * stay in sync with ce_dispact_t. 
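+ * flt_to_error_type() below indexes this array directly with a CE_DISP_*
+ * value, so the order of the entries defines the mapping.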
+ */ + +static char *cetypes[] = { + CE_DISP_DESC_U, + CE_DISP_DESC_I, + CE_DISP_DESC_PP, + CE_DISP_DESC_P, + CE_DISP_DESC_L, + CE_DISP_DESC_PS, + CE_DISP_DESC_S +}; + +char * +flt_to_error_type(struct async_flt *aflt) +{ + ce_dispact_t dispact, disp; + uchar_t dtcrinfo, ptnrinfo, lkyinfo; + + /* + * The memory payload bundle is shared by some events that do + * not perform any classification. For those flt_disp will be + * 0 and we will return "unknown". + */ + if (!ce_disp_inited || !aflt->flt_in_memory || aflt->flt_disp == 0) + return (cetypes[CE_DISP_UNKNOWN]); + + dtcrinfo = CE_XDIAG_DTCRINFO(aflt->flt_disp); + + /* + * It is also possible that no scrub/classification was performed + * by the detector, for instance where a disrupting error logged + * in the AFSR while CEEN was off in cpu_deferred_error. + */ + if (!CE_XDIAG_EXT_ALG_APPLIED(dtcrinfo)) + return (cetypes[CE_DISP_UNKNOWN]); + + /* + * Lookup type in initial classification/action table + */ + dispact = CE_DISPACT(ce_disp_table, + CE_XDIAG_AFARMATCHED(dtcrinfo), + CE_XDIAG_STATE(dtcrinfo), + CE_XDIAG_CE1SEEN(dtcrinfo), + CE_XDIAG_CE2SEEN(dtcrinfo)); + + /* + * A bad lookup is not something to panic production systems for. + */ + ASSERT(dispact != CE_DISP_BAD); + if (dispact == CE_DISP_BAD) + return (cetypes[CE_DISP_UNKNOWN]); + + disp = CE_DISP(dispact); + + switch (disp) { + case CE_DISP_UNKNOWN: + case CE_DISP_INTERMITTENT: + break; + + case CE_DISP_POSS_PERS: + /* + * "Possible persistent" errors to which we have applied a valid + * leaky test can be separated into "persistent" or "leaky". + */ + lkyinfo = CE_XDIAG_LKYINFO(aflt->flt_disp); + if (CE_XDIAG_TESTVALID(lkyinfo)) { + if (CE_XDIAG_CE1SEEN(lkyinfo) || + CE_XDIAG_CE2SEEN(lkyinfo)) + disp = CE_DISP_LEAKY; + else + disp = CE_DISP_PERS; + } + break; + + case CE_DISP_POSS_STICKY: + /* + * Promote "possible sticky" results that have been + * confirmed by a partner test to "sticky". Unconfirmed + * "possible sticky" events are left at that status - we do not + * guess at any bad reader/writer etc status here. + */ + ptnrinfo = CE_XDIAG_PTNRINFO(aflt->flt_disp); + if (CE_XDIAG_TESTVALID(ptnrinfo) && + CE_XDIAG_CE1SEEN(ptnrinfo) && CE_XDIAG_CE2SEEN(ptnrinfo)) + disp = CE_DISP_STICKY; + + /* + * Promote "possible sticky" results on a uniprocessor + * to "sticky" + */ + if (disp == CE_DISP_POSS_STICKY && + CE_XDIAG_SKIPCODE(disp) == CE_XDIAG_SKIP_UNIPROC) + disp = CE_DISP_STICKY; + break; + + default: + disp = CE_DISP_UNKNOWN; + break; + } + + return (cetypes[disp]); +} + +/* + * Given the entire afsr, the specific bit to check and a prioritized list of + * error bits, determine the validity of the various overwrite priority + * features of the AFSR/AFAR: AFAR, ESYND and MSYND, each of which have + * different overwrite priorities. + * + * Given a specific afsr error bit and the entire afsr, there are three cases: + * INVALID: The specified bit is lower overwrite priority than some other + * error bit which is on in the afsr (or IVU/IVC). + * VALID: The specified bit is higher priority than all other error bits + * which are on in the afsr. + * AMBIGUOUS: Another error bit (or bits) of equal priority to the specified + * bit is on in the afsr. + */ +int +afsr_to_overw_status(uint64_t afsr, uint64_t afsr_bit, uint64_t *ow_bits) +{ + uint64_t afsr_ow; + + while ((afsr_ow = *ow_bits++) != 0) { + /* + * If bit is in the priority class, check to see if another + * bit in the same class is on => ambiguous. Otherwise, + * the value is valid. 
If the bit is not on at this priority + * class, but a higher priority bit is on, then the value is + * invalid. + */ + if (afsr_ow & afsr_bit) { + /* + * If equal pri bit is on, ambiguous. + */ + if (afsr & (afsr_ow & ~afsr_bit)) + return (AFLT_STAT_AMBIGUOUS); + return (AFLT_STAT_VALID); + } else if (afsr & afsr_ow) + break; + } + + /* + * We didn't find a match or a higher priority bit was on. Not + * finding a match handles the case of invalid AFAR for IVC, IVU. + */ + return (AFLT_STAT_INVALID); +} + +static int +afsr_to_afar_status(uint64_t afsr, uint64_t afsr_bit) +{ +#if defined(SERRANO) + if (afsr_bit & (C_AFSR_FRC | C_AFSR_FRU)) + return (afsr_to_overw_status(afsr, afsr_bit, afar2_overwrite)); + else +#endif /* SERRANO */ + return (afsr_to_overw_status(afsr, afsr_bit, afar_overwrite)); +} + +static int +afsr_to_esynd_status(uint64_t afsr, uint64_t afsr_bit) +{ + return (afsr_to_overw_status(afsr, afsr_bit, esynd_overwrite)); +} + +static int +afsr_to_msynd_status(uint64_t afsr, uint64_t afsr_bit) +{ + return (afsr_to_overw_status(afsr, afsr_bit, msynd_overwrite)); +} + +static int +afsr_to_synd_status(uint_t cpuid, uint64_t afsr, uint64_t afsr_bit) +{ +#ifdef lint + cpuid = cpuid; +#endif + if (afsr_bit & C_AFSR_MSYND_ERRS) { + return (afsr_to_msynd_status(afsr, afsr_bit)); + } else if (afsr_bit & (C_AFSR_ESYND_ERRS | C_AFSR_EXT_ESYND_ERRS)) { +#if defined(CHEETAH_PLUS) + /* + * The E_SYND overwrite policy is slightly different + * for Panther CPUs. + */ + if (IS_PANTHER(cpunodes[cpuid].implementation)) + return (afsr_to_pn_esynd_status(afsr, afsr_bit)); + else + return (afsr_to_esynd_status(afsr, afsr_bit)); +#else /* CHEETAH_PLUS */ + return (afsr_to_esynd_status(afsr, afsr_bit)); +#endif /* CHEETAH_PLUS */ + } else { + return (AFLT_STAT_INVALID); + } +} + +/* + * Slave CPU stick synchronization. + */ +void +sticksync_slave(void) +{ + int i; + int tries = 0; + int64_t tskew; + int64_t av_tskew; + + kpreempt_disable(); + /* wait for the master side */ + while (stick_sync_cmd != SLAVE_START) + ; + /* + * Synchronization should only take a few tries at most. But in the + * odd case where the cpu isn't cooperating we'll keep trying. A cpu + * without it's stick synchronized wouldn't be a good citizen. + */ + while (slave_done == 0) { + /* + * Time skew calculation. + */ + av_tskew = tskew = 0; + + for (i = 0; i < stick_iter; i++) { + /* make location hot */ + timestamp[EV_A_START] = 0; + stick_timestamp(×tamp[EV_A_START]); + + /* tell the master we're ready */ + stick_sync_cmd = MASTER_START; + + /* and wait */ + while (stick_sync_cmd != SLAVE_CONT) + ; + /* Event B end */ + stick_timestamp(×tamp[EV_B_END]); + + /* calculate time skew */ + tskew = ((timestamp[EV_B_END] - timestamp[EV_B_START]) + - (timestamp[EV_A_END] - + timestamp[EV_A_START])) / 2; + + /* keep running count */ + av_tskew += tskew; + } /* for */ + + /* + * Adjust stick for time skew if not within the max allowed; + * otherwise we're all done. + */ + if (stick_iter != 0) + av_tskew = av_tskew/stick_iter; + if (ABS(av_tskew) > stick_tsk) { + /* + * If the skew is 1 (the slave's STICK register + * is 1 STICK ahead of the master's), stick_adj + * could fail to adjust the slave's STICK register + * if the STICK read on the slave happens to + * align with the increment of the STICK. + * Therefore, we increment the skew to 2. 
+ */
+ if (av_tskew == 1)
+ av_tskew++;
+ stick_adj(-av_tskew);
+ } else
+ slave_done = 1;
+#ifdef DEBUG
+ if (tries < DSYNC_ATTEMPTS)
+ stick_sync_stats[CPU->cpu_id].skew_val[tries] =
+ av_tskew;
+ ++tries;
+#endif /* DEBUG */
+#ifdef lint
+ tries = tries;
+#endif
+
+ } /* while */
+
+ /* allow the master to finish */
+ stick_sync_cmd = EVENT_NULL;
+ kpreempt_enable();
+}
+
+/*
+ * Master CPU side of stick synchronization.
+ * - timestamp end of Event A
+ * - timestamp beginning of Event B
+ */
+void
+sticksync_master(void)
+{
+ int i;
+
+ kpreempt_disable();
+ /* tell the slave we've started */
+ slave_done = 0;
+ stick_sync_cmd = SLAVE_START;
+
+ while (slave_done == 0) {
+ for (i = 0; i < stick_iter; i++) {
+ /* wait for the slave */
+ while (stick_sync_cmd != MASTER_START)
+ ;
+ /* Event A end */
+ stick_timestamp(&timestamp[EV_A_END]);
+
+ /* make location hot */
+ timestamp[EV_B_START] = 0;
+ stick_timestamp(&timestamp[EV_B_START]);
+
+ /* tell the slave to continue */
+ stick_sync_cmd = SLAVE_CONT;
+ } /* for */
+
+ /* wait while slave calculates time skew */
+ while (stick_sync_cmd == SLAVE_CONT)
+ ;
+ } /* while */
+ kpreempt_enable();
+}
+
+/*
+ * Cheetah/Cheetah+ have disrupting error for copybacks, so we don't need to
+ * do Spitfire hack of xcall'ing all the cpus to ask to check for them. Also,
+ * in cpu_async_panic_callb, each cpu checks for CPU events on its way to
+ * panic idle.
+ */
+/*ARGSUSED*/
+void
+cpu_check_allcpus(struct async_flt *aflt)
+{}
+
+struct kmem_cache *ch_private_cache;
+
+/*
+ * Cpu private uninitialization. Uninitialize the Ecache scrubber and
+ * deallocate the scrubber data structures and cpu_private data structure.
+ */
+void
+cpu_uninit_private(struct cpu *cp)
+{
+ cheetah_private_t *chprp = CPU_PRIVATE(cp);
+
+ ASSERT(chprp);
+ cpu_uninit_ecache_scrub_dr(cp);
+ CPU_PRIVATE(cp) = NULL;
+ ch_err_tl1_paddrs[cp->cpu_id] = NULL;
+ kmem_cache_free(ch_private_cache, chprp);
+ cmp_delete_cpu(cp->cpu_id);
+
+}
+
+/*
+ * Cheetah Cache Scrubbing
+ *
+ * The primary purpose of Cheetah cache scrubbing is to reduce the exposure
+ * of E$ tags, D$ data, and I$ data to cosmic ray events since they are not
+ * protected by either parity or ECC.
+ *
+ * We currently default the E$ and D$ scan rate to 100 (scan 10% of the
+ * cache per second). Due to the specifics of how the I$ control
+ * logic works with respect to the ASI used to scrub I$ lines, the entire
+ * I$ is scanned at once.
+ */
+
+/*
+ * Tuneables to enable and disable the scrubbing of the caches, and to tune
+ * scrubbing behavior. These may be changed via /etc/system or using mdb
+ * on a running system.
+ */
+int dcache_scrub_enable = 1; /* D$ scrubbing is on by default */
+
+/*
+ * The following are the PIL levels that the softints/cross traps will fire at.
+ */
+uint_t ecache_scrub_pil = PIL_9; /* E$ scrub PIL for cross traps */
+uint_t dcache_scrub_pil = PIL_9; /* D$ scrub PIL for cross traps */
+uint_t icache_scrub_pil = PIL_9; /* I$ scrub PIL for cross traps */
+
+#if defined(JALAPENO)
+
+/*
+ * Due to several errata (82, 85, 86), we don't enable the L2$ scrubber
+ * on Jalapeno.
+ */ +int ecache_scrub_enable = 0; + +#else /* JALAPENO */ + +/* + * With all other cpu types, E$ scrubbing is on by default + */ +int ecache_scrub_enable = 1; + +#endif /* JALAPENO */ + + +#if defined(CHEETAH_PLUS) || defined(JALAPENO) || defined(SERRANO) + +/* + * The I$ scrubber tends to cause latency problems for real-time SW, so it + * is disabled by default on non-Cheetah systems + */ +int icache_scrub_enable = 0; + +/* + * Tuneables specifying the scrub calls per second and the scan rate + * for each cache + * + * The cyclic times are set during boot based on the following values. + * Changing these values in mdb after this time will have no effect. If + * a different value is desired, it must be set in /etc/system before a + * reboot. + */ +int ecache_calls_a_sec = 1; +int dcache_calls_a_sec = 2; +int icache_calls_a_sec = 2; + +int ecache_scan_rate_idle = 1; +int ecache_scan_rate_busy = 1; +int dcache_scan_rate_idle = 1; +int dcache_scan_rate_busy = 1; +int icache_scan_rate_idle = 1; +int icache_scan_rate_busy = 1; + +#else /* CHEETAH_PLUS || JALAPENO || SERRANO */ + +int icache_scrub_enable = 1; /* I$ scrubbing is on by default */ + +int ecache_calls_a_sec = 100; /* E$ scrub calls per seconds */ +int dcache_calls_a_sec = 100; /* D$ scrub calls per seconds */ +int icache_calls_a_sec = 100; /* I$ scrub calls per seconds */ + +int ecache_scan_rate_idle = 100; /* E$ scan rate (in tenths of a %) */ +int ecache_scan_rate_busy = 100; /* E$ scan rate (in tenths of a %) */ +int dcache_scan_rate_idle = 100; /* D$ scan rate (in tenths of a %) */ +int dcache_scan_rate_busy = 100; /* D$ scan rate (in tenths of a %) */ +int icache_scan_rate_idle = 100; /* I$ scan rate (in tenths of a %) */ +int icache_scan_rate_busy = 100; /* I$ scan rate (in tenths of a %) */ + +#endif /* CHEETAH_PLUS || JALAPENO || SERRANO */ + +/* + * In order to scrub on offline cpus, a cross trap is sent. The handler will + * increment the outstanding request counter and schedule a softint to run + * the scrubber. + */ +extern xcfunc_t cache_scrubreq_tl1; + +/* + * These are the softint functions for each cache scrubber + */ +static uint_t scrub_ecache_line_intr(caddr_t arg1, caddr_t arg2); +static uint_t scrub_dcache_line_intr(caddr_t arg1, caddr_t arg2); +static uint_t scrub_icache_line_intr(caddr_t arg1, caddr_t arg2); + +/* + * The cache scrub info table contains cache specific information + * and allows for some of the scrub code to be table driven, reducing + * duplication of cache similar code. + * + * This table keeps a copy of the value in the calls per second variable + * (?cache_calls_a_sec). This makes it much more difficult for someone + * to cause us problems (for example, by setting ecache_calls_a_sec to 0 in + * mdb in a misguided attempt to disable the scrubber). + */ +struct scrub_info { + int *csi_enable; /* scrubber enable flag */ + int csi_freq; /* scrubber calls per second */ + int csi_index; /* index to chsm_outstanding[] */ + uint_t csi_inum; /* scrubber interrupt number */ + cyclic_id_t csi_omni_cyc_id; /* omni cyclic ID */ + cyclic_id_t csi_offline_cyc_id; /* offline cyclic ID */ + char csi_name[3]; /* cache name for this scrub entry */ +} cache_scrub_info[] = { +{ &ecache_scrub_enable, 0, CACHE_SCRUBBER_INFO_E, 0, 0, 0, "E$"}, +{ &dcache_scrub_enable, 0, CACHE_SCRUBBER_INFO_D, 0, 0, 0, "D$"}, +{ &icache_scrub_enable, 0, CACHE_SCRUBBER_INFO_I, 0, 0, 0, "I$"} +}; + +/* + * If scrubbing is enabled, increment the outstanding request counter. 
If it + * is 1 (meaning there were no previous requests outstanding), call + * setsoftint_tl1 through xt_one_unchecked, which eventually ends up doing + * a self trap. + */ +static void +do_scrub(struct scrub_info *csi) +{ + ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc); + int index = csi->csi_index; + uint32_t *outstanding = &csmp->chsm_outstanding[index]; + + if (*(csi->csi_enable) && (csmp->chsm_enable[index])) { + if (atomic_add_32_nv(outstanding, 1) == 1) { + xt_one_unchecked(CPU->cpu_id, setsoftint_tl1, + csi->csi_inum, 0); + } + } +} + +/* + * Omni cyclics don't fire on offline cpus, so we use another cyclic to + * cross-trap the offline cpus. + */ +static void +do_scrub_offline(struct scrub_info *csi) +{ + ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc); + + if (CPUSET_ISNULL(cpu_offline_set)) { + /* + * No offline cpus - nothing to do + */ + return; + } + + if (*(csi->csi_enable) && (csmp->chsm_enable[csi->csi_index])) { + xt_some(cpu_offline_set, cache_scrubreq_tl1, csi->csi_inum, + csi->csi_index); + } +} + +/* + * This is the initial setup for the scrubber cyclics - it sets the + * interrupt level, frequency, and function to call. + */ +/*ARGSUSED*/ +static void +cpu_scrub_cyclic_setup(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, + cyc_time_t *when) +{ + struct scrub_info *csi = (struct scrub_info *)arg; + + ASSERT(csi != NULL); + hdlr->cyh_func = (cyc_func_t)do_scrub; + hdlr->cyh_level = CY_LOW_LEVEL; + hdlr->cyh_arg = arg; + + when->cyt_when = 0; /* Start immediately */ + when->cyt_interval = NANOSEC / csi->csi_freq; +} + +/* + * Initialization for cache scrubbing. + * This routine is called AFTER all cpus have had cpu_init_private called + * to initialize their private data areas. + */ +void +cpu_init_cache_scrub(void) +{ + int i; + struct scrub_info *csi; + cyc_omni_handler_t omni_hdlr; + cyc_handler_t offline_hdlr; + cyc_time_t when; + + /* + * save away the maximum number of lines for the D$ + */ + dcache_nlines = dcache_size / dcache_linesize; + + /* + * register the softints for the cache scrubbing + */ + cache_scrub_info[CACHE_SCRUBBER_INFO_E].csi_inum = + add_softintr(ecache_scrub_pil, scrub_ecache_line_intr, + (caddr_t)&cache_scrub_info[CACHE_SCRUBBER_INFO_E]); + cache_scrub_info[CACHE_SCRUBBER_INFO_E].csi_freq = ecache_calls_a_sec; + + cache_scrub_info[CACHE_SCRUBBER_INFO_D].csi_inum = + add_softintr(dcache_scrub_pil, scrub_dcache_line_intr, + (caddr_t)&cache_scrub_info[CACHE_SCRUBBER_INFO_D]); + cache_scrub_info[CACHE_SCRUBBER_INFO_D].csi_freq = dcache_calls_a_sec; + + cache_scrub_info[CACHE_SCRUBBER_INFO_I].csi_inum = + add_softintr(icache_scrub_pil, scrub_icache_line_intr, + (caddr_t)&cache_scrub_info[CACHE_SCRUBBER_INFO_I]); + cache_scrub_info[CACHE_SCRUBBER_INFO_I].csi_freq = icache_calls_a_sec; + + /* + * start the scrubbing for all the caches + */ + mutex_enter(&cpu_lock); + for (i = 0; i < CACHE_SCRUBBER_COUNT; i++) { + + csi = &cache_scrub_info[i]; + + if (!(*csi->csi_enable)) + continue; + + /* + * force the following to be true: + * 1 <= calls_a_sec <= hz + */ + if (csi->csi_freq > hz) { + cmn_err(CE_NOTE, "%s scrub calls_a_sec set too high " + "(%d); resetting to hz (%d)", csi->csi_name, + csi->csi_freq, hz); + csi->csi_freq = hz; + } else if (csi->csi_freq < 1) { + cmn_err(CE_NOTE, "%s scrub calls_a_sec set too low " + "(%d); resetting to 1", csi->csi_name, + csi->csi_freq); + csi->csi_freq = 1; + } + + omni_hdlr.cyo_online = cpu_scrub_cyclic_setup; + omni_hdlr.cyo_offline = NULL; + omni_hdlr.cyo_arg = (void *)csi; + + 
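+ /*
+ * The omni cyclic registered via omni_hdlr runs do_scrub() on
+ * each online cpu; the ordinary cyclic set up below runs
+ * do_scrub_offline() so that cpus in cpu_offline_set, which
+ * omni cyclics never fire on, still get cross-trapped to scrub.
+ */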
offline_hdlr.cyh_func = (cyc_func_t)do_scrub_offline; + offline_hdlr.cyh_arg = (void *)csi; + offline_hdlr.cyh_level = CY_LOW_LEVEL; + + when.cyt_when = 0; /* Start immediately */ + when.cyt_interval = NANOSEC / csi->csi_freq; + + csi->csi_omni_cyc_id = cyclic_add_omni(&omni_hdlr); + csi->csi_offline_cyc_id = cyclic_add(&offline_hdlr, &when); + } + register_cpu_setup_func(cpu_scrub_cpu_setup, NULL); + mutex_exit(&cpu_lock); +} + +/* + * Indicate that the specified cpu is idle. + */ +void +cpu_idle_ecache_scrub(struct cpu *cp) +{ + if (CPU_PRIVATE(cp) != NULL) { + ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(cp, chpr_scrub_misc); + csmp->chsm_ecache_busy = ECACHE_CPU_IDLE; + } +} + +/* + * Indicate that the specified cpu is busy. + */ +void +cpu_busy_ecache_scrub(struct cpu *cp) +{ + if (CPU_PRIVATE(cp) != NULL) { + ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(cp, chpr_scrub_misc); + csmp->chsm_ecache_busy = ECACHE_CPU_BUSY; + } +} + +/* + * Initialization for cache scrubbing for the specified cpu. + */ +void +cpu_init_ecache_scrub_dr(struct cpu *cp) +{ + ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(cp, chpr_scrub_misc); + int cpuid = cp->cpu_id; + + /* initialize the number of lines in the caches */ + csmp->chsm_ecache_nlines = cpunodes[cpuid].ecache_size / + cpunodes[cpuid].ecache_linesize; + csmp->chsm_icache_nlines = CPU_PRIVATE_VAL(cp, chpr_icache_size) / + CPU_PRIVATE_VAL(cp, chpr_icache_linesize); + + /* + * do_scrub() and do_scrub_offline() check both the global + * ?cache_scrub_enable and this per-cpu enable variable. All scrubbers + * check this value before scrubbing. Currently, we use it to + * disable the E$ scrubber on multi-core cpus or while running at + * slowed speed. For now, just turn everything on and allow + * cpu_init_private() to change it if necessary. + */ + csmp->chsm_enable[CACHE_SCRUBBER_INFO_E] = 1; + csmp->chsm_enable[CACHE_SCRUBBER_INFO_D] = 1; + csmp->chsm_enable[CACHE_SCRUBBER_INFO_I] = 1; + + cpu_busy_ecache_scrub(cp); +} + +/* + * Un-initialization for cache scrubbing for the specified cpu. + */ +static void +cpu_uninit_ecache_scrub_dr(struct cpu *cp) +{ + ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(cp, chpr_scrub_misc); + + /* + * un-initialize bookkeeping for cache scrubbing + */ + bzero(csmp, sizeof (ch_scrub_misc_t)); + + cpu_idle_ecache_scrub(cp); +} + +/* + * Called periodically on each CPU to scrub the D$. + */ +static void +scrub_dcache(int how_many) +{ + int i; + ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc); + int index = csmp->chsm_flush_index[CACHE_SCRUBBER_INFO_D]; + + /* + * scrub the desired number of lines + */ + for (i = 0; i < how_many; i++) { + /* + * scrub a D$ line + */ + dcache_inval_line(index); + + /* + * calculate the next D$ line to scrub, assumes + * that dcache_nlines is a power of 2 + */ + index = (index + 1) & (dcache_nlines - 1); + } + + /* + * set the scrub index for the next visit + */ + csmp->chsm_flush_index[CACHE_SCRUBBER_INFO_D] = index; +} + +/* + * Handler for D$ scrub inum softint. Call scrub_dcache until + * we decrement the outstanding request count to zero. + */ +/*ARGSUSED*/ +static uint_t +scrub_dcache_line_intr(caddr_t arg1, caddr_t arg2) +{ + int i; + int how_many; + int outstanding; + ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc); + uint32_t *countp = &csmp->chsm_outstanding[CACHE_SCRUBBER_INFO_D]; + struct scrub_info *csi = (struct scrub_info *)arg1; + int scan_rate = (csmp->chsm_ecache_busy == ECACHE_CPU_IDLE) ? 
+ dcache_scan_rate_idle : dcache_scan_rate_busy; + + /* + * The scan rates are expressed in units of tenths of a + * percent. A scan rate of 1000 (100%) means the whole + * cache is scanned every second. + */ + how_many = (dcache_nlines * scan_rate) / (1000 * csi->csi_freq); + + do { + outstanding = *countp; + ASSERT(outstanding > 0); + for (i = 0; i < outstanding; i++) { + scrub_dcache(how_many); + } + } while (atomic_add_32_nv(countp, -outstanding)); + + return (DDI_INTR_CLAIMED); +} + +/* + * Called periodically on each CPU to scrub the I$. The I$ is scrubbed + * by invalidating lines. Due to the characteristics of the ASI which + * is used to invalidate an I$ line, the entire I$ must be invalidated + * vs. an individual I$ line. + */ +static void +scrub_icache(int how_many) +{ + int i; + ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc); + int index = csmp->chsm_flush_index[CACHE_SCRUBBER_INFO_I]; + int icache_nlines = csmp->chsm_icache_nlines; + + /* + * scrub the desired number of lines + */ + for (i = 0; i < how_many; i++) { + /* + * since the entire I$ must be scrubbed at once, + * wait until the index wraps to zero to invalidate + * the entire I$ + */ + if (index == 0) { + icache_inval_all(); + } + + /* + * calculate the next I$ line to scrub, assumes + * that chsm_icache_nlines is a power of 2 + */ + index = (index + 1) & (icache_nlines - 1); + } + + /* + * set the scrub index for the next visit + */ + csmp->chsm_flush_index[CACHE_SCRUBBER_INFO_I] = index; +} + +/* + * Handler for I$ scrub inum softint. Call scrub_icache until + * we decrement the outstanding request count to zero. + */ +/*ARGSUSED*/ +static uint_t +scrub_icache_line_intr(caddr_t arg1, caddr_t arg2) +{ + int i; + int how_many; + int outstanding; + ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc); + uint32_t *countp = &csmp->chsm_outstanding[CACHE_SCRUBBER_INFO_I]; + struct scrub_info *csi = (struct scrub_info *)arg1; + int scan_rate = (csmp->chsm_ecache_busy == ECACHE_CPU_IDLE) ? + icache_scan_rate_idle : icache_scan_rate_busy; + int icache_nlines = csmp->chsm_icache_nlines; + + /* + * The scan rates are expressed in units of tenths of a + * percent. A scan rate of 1000 (100%) means the whole + * cache is scanned every second. + */ + how_many = (icache_nlines * scan_rate) / (1000 * csi->csi_freq); + + do { + outstanding = *countp; + ASSERT(outstanding > 0); + for (i = 0; i < outstanding; i++) { + scrub_icache(how_many); + } + } while (atomic_add_32_nv(countp, -outstanding)); + + return (DDI_INTR_CLAIMED); +} + +/* + * Called periodically on each CPU to scrub the E$. 
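+ * As with the D$ and I$ handlers above, the softint handler works
+ * out how many lines to scrub per call from the scan rate; for
+ * example, with a hypothetical 65536 E$ lines, a scan rate of 100
+ * (tenths of a percent, i.e. 10%) and 1 call per second, how_many =
+ * (65536 * 100) / (1000 * 1) = 6553 lines per call, which works out
+ * to 10% of the cache scanned per second.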
+ */ +static void +scrub_ecache(int how_many) +{ + ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc); + int i; + int cpuid = CPU->cpu_id; + int index = csmp->chsm_flush_index[CACHE_SCRUBBER_INFO_E]; + int nlines = csmp->chsm_ecache_nlines; + int linesize = cpunodes[cpuid].ecache_linesize; + int ec_set_size = cpu_ecache_set_size(CPU); + + /* + * scrub the desired number of lines + */ + for (i = 0; i < how_many; i++) { + /* + * scrub the E$ line + */ + ecache_flush_line(ecache_flushaddr + (index * linesize), + ec_set_size); + + /* + * calculate the next E$ line to scrub based on twice + * the number of E$ lines (to displace lines containing + * flush area data), assumes that the number of lines + * is a power of 2 + */ + index = (index + 1) & ((nlines << 1) - 1); + } + + /* + * set the ecache scrub index for the next visit + */ + csmp->chsm_flush_index[CACHE_SCRUBBER_INFO_E] = index; +} + +/* + * Handler for E$ scrub inum softint. Call the E$ scrubber until + * we decrement the outstanding request count to zero. + */ +/*ARGSUSED*/ +static uint_t +scrub_ecache_line_intr(caddr_t arg1, caddr_t arg2) +{ + int i; + int how_many; + int outstanding; + ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc); + uint32_t *countp = &csmp->chsm_outstanding[CACHE_SCRUBBER_INFO_E]; + struct scrub_info *csi = (struct scrub_info *)arg1; + int scan_rate = (csmp->chsm_ecache_busy == ECACHE_CPU_IDLE) ? + ecache_scan_rate_idle : ecache_scan_rate_busy; + int ecache_nlines = csmp->chsm_ecache_nlines; + + /* + * The scan rates are expressed in units of tenths of a + * percent. A scan rate of 1000 (100%) means the whole + * cache is scanned every second. + */ + how_many = (ecache_nlines * scan_rate) / (1000 * csi->csi_freq); + + do { + outstanding = *countp; + ASSERT(outstanding > 0); + for (i = 0; i < outstanding; i++) { + scrub_ecache(how_many); + } + } while (atomic_add_32_nv(countp, -outstanding)); + + return (DDI_INTR_CLAIMED); +} + +/* + * Timeout function to reenable CE + */ +static void +cpu_delayed_check_ce_errors(void *arg) +{ + if (!taskq_dispatch(ch_check_ce_tq, cpu_check_ce_errors, arg, + TQ_NOSLEEP)) { + (void) timeout(cpu_delayed_check_ce_errors, arg, + drv_usectohz((clock_t)cpu_ceen_delay_secs * MICROSEC)); + } +} + +/* + * CE Deferred Re-enable after trap. + * + * When the CPU gets a disrupting trap for any of the errors + * controlled by the CEEN bit, CEEN is disabled in the trap handler + * immediately. To eliminate the possibility of multiple CEs causing + * recursive stack overflow in the trap handler, we cannot + * reenable CEEN while still running in the trap handler. Instead, + * after a CE is logged on a CPU, we schedule a timeout function, + * cpu_check_ce_errors(), to trigger after cpu_ceen_delay_secs + * seconds. This function will check whether any further CEs + * have occurred on that CPU, and if none have, will reenable CEEN. + * + * If further CEs have occurred while CEEN is disabled, another + * timeout will be scheduled. This is to ensure that the CPU can + * make progress in the face of CE 'storms', and that it does not + * spend all its time logging CE errors. + */ +static void +cpu_check_ce_errors(void *arg) +{ + int cpuid = (int)arg; + cpu_t *cp; + + /* + * We acquire cpu_lock. + */ + ASSERT(curthread->t_pil == 0); + + /* + * verify that the cpu is still around, DR + * could have got there first ... 
+ */ + mutex_enter(&cpu_lock); + cp = cpu_get(cpuid); + if (cp == NULL) { + mutex_exit(&cpu_lock); + return; + } + /* + * make sure we don't migrate across CPUs + * while checking our CE status. + */ + kpreempt_disable(); + + /* + * If we are running on the CPU that got the + * CE, we can do the checks directly. + */ + if (cp->cpu_id == CPU->cpu_id) { + mutex_exit(&cpu_lock); + cpu_check_ce(TIMEOUT_CEEN_CHECK, 0, 0, 0); + kpreempt_enable(); + return; + } + kpreempt_enable(); + + /* + * send an x-call to get the CPU that originally + * got the CE to do the necessary checks. If we can't + * send the x-call, reschedule the timeout, otherwise we + * lose CEEN forever on that CPU. + */ + if (CPU_XCALL_READY(cp->cpu_id) && (!(cp->cpu_flags & CPU_QUIESCED))) { + xc_one(cp->cpu_id, (xcfunc_t *)cpu_check_ce, + TIMEOUT_CEEN_CHECK, 0); + mutex_exit(&cpu_lock); + } else { + /* + * When the CPU is not accepting xcalls, or + * the processor is offlined, we don't want to + * incur the extra overhead of trying to schedule the + * CE timeout indefinitely. However, we don't want to lose + * CE checking forever. + * + * Keep rescheduling the timeout, accepting the additional + * overhead as the cost of correctness in the case where we get + * a CE, disable CEEN, offline the CPU during the + * the timeout interval, and then online it at some + * point in the future. This is unlikely given the short + * cpu_ceen_delay_secs. + */ + mutex_exit(&cpu_lock); + (void) timeout(cpu_delayed_check_ce_errors, (void *)cp->cpu_id, + drv_usectohz((clock_t)cpu_ceen_delay_secs * MICROSEC)); + } +} + +/* + * This routine will check whether CEs have occurred while + * CEEN is disabled. Any CEs detected will be logged and, if + * possible, scrubbed. + * + * The memscrubber will also use this routine to clear any errors + * caused by its scrubbing with CEEN disabled. + * + * flag == SCRUBBER_CEEN_CHECK + * called from memscrubber, just check/scrub, no reset + * paddr physical addr. for start of scrub pages + * vaddr virtual addr. for scrub area + * psz page size of area to be scrubbed + * + * flag == TIMEOUT_CEEN_CHECK + * timeout function has triggered, reset timeout or CEEN + * + * Note: We must not migrate cpus during this function. This can be + * achieved by one of: + * - invoking as target of an x-call in which case we're at XCALL_PIL + * The flag value must be first xcall argument. + * - disabling kernel preemption. This should be done for very short + * periods so is not suitable for SCRUBBER_CEEN_CHECK where we might + * scrub an extended area with cpu_check_block. The call for + * TIMEOUT_CEEN_CHECK uses this so cpu_check_ce must be kept + * brief for this case. + * - binding to a cpu, eg with thread_affinity_set(). This is used + * in the SCRUBBER_CEEN_CHECK case, but is not practical for + * the TIMEOUT_CEEN_CHECK because both need cpu_lock. + */ +void +cpu_check_ce(int flag, uint64_t pa, caddr_t va, uint_t psz) +{ + ch_cpu_errors_t cpu_error_regs; + uint64_t ec_err_enable; + uint64_t page_offset; + + /* Read AFSR */ + get_cpu_error_state(&cpu_error_regs); + + /* + * If no CEEN errors have occurred during the timeout + * interval, it is safe to re-enable CEEN and exit. + */ + if ((cpu_error_regs.afsr & C_AFSR_CECC_ERRS) == 0) { + if (flag == TIMEOUT_CEEN_CHECK && + !((ec_err_enable = get_error_enable()) & EN_REG_CEEN)) + set_error_enable(ec_err_enable | EN_REG_CEEN); + return; + } + + /* + * Ensure that CEEN was not reenabled (maybe by DR) before + * we log/clear the error. 
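+ * The read-modify-write below clears EN_REG_CEEN; it is restored at
+ * the end of this function only for the TIMEOUT_CEEN_CHECK case and
+ * only when the AFSR has a bit set in cpu_ce_not_deferred (otherwise
+ * CE_CEEN_DEFER is passed to cpu_ce_detected() and the timeout is
+ * rescheduled when the error is logged).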
+ */ + if ((ec_err_enable = get_error_enable()) & EN_REG_CEEN) + set_error_enable(ec_err_enable & ~EN_REG_CEEN); + + /* + * log/clear the CE. If CE_CEEN_DEFER is passed, the + * timeout will be rescheduled when the error is logged. + */ + if (!(cpu_error_regs.afsr & cpu_ce_not_deferred)) + cpu_ce_detected(&cpu_error_regs, + CE_CEEN_DEFER | CE_CEEN_TIMEOUT); + else + cpu_ce_detected(&cpu_error_regs, CE_CEEN_TIMEOUT); + + /* + * If the memory scrubber runs while CEEN is + * disabled, (or if CEEN is disabled during the + * scrub as a result of a CE being triggered by + * it), the range being scrubbed will not be + * completely cleaned. If there are multiple CEs + * in the range at most two of these will be dealt + * with, (one by the trap handler and one by the + * timeout). It is also possible that none are dealt + * with, (CEEN disabled and another CE occurs before + * the timeout triggers). So to ensure that the + * memory is actually scrubbed, we have to access each + * memory location in the range and then check whether + * that access causes a CE. + */ + if (flag == SCRUBBER_CEEN_CHECK && va) { + if ((cpu_error_regs.afar >= pa) && + (cpu_error_regs.afar < (pa + psz))) { + /* + * Force a load from physical memory for each + * 64-byte block, then check AFSR to determine + * whether this access caused an error. + * + * This is a slow way to do a scrub, but as it will + * only be invoked when the memory scrubber actually + * triggered a CE, it should not happen too + * frequently. + * + * cut down what we need to check as the scrubber + * has verified up to AFAR, so get it's offset + * into the page and start there. + */ + page_offset = (uint64_t)(cpu_error_regs.afar & + (psz - 1)); + va = (caddr_t)(va + (P2ALIGN(page_offset, 64))); + psz -= (uint_t)(P2ALIGN(page_offset, 64)); + cpu_check_block((caddr_t)(P2ALIGN((uint64_t)va, 64)), + psz); + } + } + + /* + * Reset error enable if this CE is not masked. + */ + if ((flag == TIMEOUT_CEEN_CHECK) && + (cpu_error_regs.afsr & cpu_ce_not_deferred)) + set_error_enable(ec_err_enable | EN_REG_CEEN); + +} + +/* + * Attempt a cpu logout for an error that we did not trap for, such + * as a CE noticed with CEEN off. It is assumed that we are still running + * on the cpu that took the error and that we cannot migrate. Returns + * 0 on success, otherwise nonzero. + */ +static int +cpu_ce_delayed_ec_logout(uint64_t afar) +{ + ch_cpu_logout_t *clop; + + if (CPU_PRIVATE(CPU) == NULL) + return (0); + + clop = CPU_PRIVATE_PTR(CPU, chpr_cecc_logout); + if (cas64(&clop->clo_data.chd_afar, LOGOUT_INVALID, afar) != + LOGOUT_INVALID) + return (0); + + cpu_delayed_logout(afar, clop); + return (1); +} + +/* + * We got an error while CEEN was disabled. We + * need to clean up after it and log whatever + * information we have on the CE. 
+ */ +void +cpu_ce_detected(ch_cpu_errors_t *cpu_error_regs, int flag) +{ + ch_async_flt_t ch_flt; + struct async_flt *aflt; + char pr_reason[MAX_REASON_STRING]; + + bzero(&ch_flt, sizeof (ch_async_flt_t)); + ch_flt.flt_trapped_ce = flag; + aflt = (struct async_flt *)&ch_flt; + aflt->flt_stat = cpu_error_regs->afsr & C_AFSR_MASK; + ch_flt.afsr_ext = cpu_error_regs->afsr_ext; + ch_flt.afsr_errs = (cpu_error_regs->afsr_ext & C_AFSR_EXT_ALL_ERRS) | + (cpu_error_regs->afsr & C_AFSR_ALL_ERRS); + aflt->flt_addr = cpu_error_regs->afar; +#if defined(SERRANO) + ch_flt.afar2 = cpu_error_regs->afar2; +#endif /* SERRANO */ + aflt->flt_pc = NULL; + aflt->flt_priv = ((cpu_error_regs->afsr & C_AFSR_PRIV) != 0); + aflt->flt_tl = 0; + aflt->flt_panic = 0; + cpu_log_and_clear_ce(&ch_flt); + + /* + * check if we caused any errors during cleanup + */ + if (clear_errors(&ch_flt)) { + pr_reason[0] = '\0'; + (void) cpu_queue_events(&ch_flt, pr_reason, ch_flt.afsr_errs, + NULL); + } +} + +/* + * Log/clear CEEN-controlled disrupting errors + */ +static void +cpu_log_and_clear_ce(ch_async_flt_t *ch_flt) +{ + struct async_flt *aflt; + uint64_t afsr, afsr_errs; + ch_cpu_logout_t *clop; + char pr_reason[MAX_REASON_STRING]; + on_trap_data_t *otp = curthread->t_ontrap; + + aflt = (struct async_flt *)ch_flt; + afsr = aflt->flt_stat; + afsr_errs = ch_flt->afsr_errs; + aflt->flt_id = gethrtime_waitfree(); + aflt->flt_bus_id = getprocessorid(); + aflt->flt_inst = CPU->cpu_id; + aflt->flt_prot = AFLT_PROT_NONE; + aflt->flt_class = CPU_FAULT; + aflt->flt_status = ECC_C_TRAP; + + pr_reason[0] = '\0'; + /* + * Get the CPU log out info for Disrupting Trap. + */ + if (CPU_PRIVATE(CPU) == NULL) { + clop = NULL; + ch_flt->flt_diag_data.chd_afar = LOGOUT_INVALID; + } else { + clop = CPU_PRIVATE_PTR(CPU, chpr_cecc_logout); + } + + if (clop && ch_flt->flt_trapped_ce & CE_CEEN_TIMEOUT) { + ch_cpu_errors_t cpu_error_regs; + + get_cpu_error_state(&cpu_error_regs); + (void) cpu_ce_delayed_ec_logout(cpu_error_regs.afar); + clop->clo_data.chd_afsr = cpu_error_regs.afsr; + clop->clo_data.chd_afar = cpu_error_regs.afar; + clop->clo_data.chd_afsr_ext = cpu_error_regs.afsr_ext; + clop->clo_sdw_data.chd_afsr = cpu_error_regs.shadow_afsr; + clop->clo_sdw_data.chd_afar = cpu_error_regs.shadow_afar; + clop->clo_sdw_data.chd_afsr_ext = + cpu_error_regs.shadow_afsr_ext; +#if defined(SERRANO) + clop->clo_data.chd_afar2 = cpu_error_regs.afar2; +#endif /* SERRANO */ + ch_flt->flt_data_incomplete = 1; + + /* + * The logging/clear code expects AFSR/AFAR to be cleared. + * The trap handler does it for CEEN enabled errors + * so we need to do it here. + */ + set_cpu_error_state(&cpu_error_regs); + } + +#if defined(JALAPENO) || defined(SERRANO) + /* + * FRC: Can't scrub memory as we don't have AFAR for Jalapeno. + * For Serrano, even thou we do have the AFAR, we still do the + * scrub on the RCE side since that's where the error type can + * be properly classified as intermittent, persistent, etc. + * + * CE/RCE: If error is in memory and AFAR is valid, scrub the memory. + * Must scrub memory before cpu_queue_events, as scrubbing memory sets + * the flt_status bits. + */ + if ((afsr & (C_AFSR_CE|C_AFSR_RCE)) && + (cpu_flt_in_memory(ch_flt, (afsr & C_AFSR_CE)) || + cpu_flt_in_memory(ch_flt, (afsr & C_AFSR_RCE)))) { + cpu_ce_scrub_mem_err(aflt, B_TRUE); + } +#else /* JALAPENO || SERRANO */ + /* + * CE/EMC: If error is in memory and AFAR is valid, scrub the memory. + * Must scrub memory before cpu_queue_events, as scrubbing memory sets + * the flt_status bits. 
+ */ + if (afsr & (C_AFSR_CE|C_AFSR_EMC)) { + if (cpu_flt_in_memory(ch_flt, (afsr & C_AFSR_CE)) || + cpu_flt_in_memory(ch_flt, (afsr & C_AFSR_EMC))) { + cpu_ce_scrub_mem_err(aflt, B_TRUE); + } + } + +#endif /* JALAPENO || SERRANO */ + + /* + * Update flt_prot if this error occurred under on_trap protection. + */ + if (otp != NULL && (otp->ot_prot & OT_DATA_EC)) + aflt->flt_prot = AFLT_PROT_EC; + + /* + * Queue events on the async event queue, one event per error bit. + */ + if (cpu_queue_events(ch_flt, pr_reason, afsr_errs, clop) == 0 || + (afsr_errs & (C_AFSR_CECC_ERRS | C_AFSR_EXT_CECC_ERRS)) == 0) { + ch_flt->flt_type = CPU_INV_AFSR; + cpu_errorq_dispatch(FM_EREPORT_CPU_USIII_INVALID_AFSR, + (void *)ch_flt, sizeof (ch_async_flt_t), ue_queue, + aflt->flt_panic); + } + + /* + * Zero out + invalidate CPU logout. + */ + if (clop) { + bzero(clop, sizeof (ch_cpu_logout_t)); + clop->clo_data.chd_afar = LOGOUT_INVALID; + } + + /* + * If either a CPC, WDC or EDC error has occurred while CEEN + * was disabled, we need to flush either the entire + * E$ or an E$ line. + */ +#if defined(JALAPENO) || defined(SERRANO) + if (afsr & (C_AFSR_EDC | C_AFSR_CPC | C_AFSR_CPU | C_AFSR_WDC)) +#else /* JALAPENO || SERRANO */ + if (afsr_errs & (C_AFSR_EDC | C_AFSR_CPC | C_AFSR_WDC | C_AFSR_L3_EDC | + C_AFSR_L3_CPC | C_AFSR_L3_WDC)) +#endif /* JALAPENO || SERRANO */ + cpu_error_ecache_flush(ch_flt); + +} + +/* + * depending on the error type, we determine whether we + * need to flush the entire ecache or just a line. + */ +static int +cpu_error_ecache_flush_required(ch_async_flt_t *ch_flt) +{ + struct async_flt *aflt; + uint64_t afsr; + uint64_t afsr_errs = ch_flt->afsr_errs; + + aflt = (struct async_flt *)ch_flt; + afsr = aflt->flt_stat; + + /* + * If we got multiple errors, no point in trying + * the individual cases, just flush the whole cache + */ + if (afsr & C_AFSR_ME) { + return (ECACHE_FLUSH_ALL); + } + + /* + * If either a CPC, WDC or EDC error has occurred while CEEN + * was disabled, we need to flush entire E$. We can't just + * flush the cache line affected as the ME bit + * is not set when multiple correctable errors of the same + * type occur, so we might have multiple CPC or EDC errors, + * with only the first recorded. + */ +#if defined(JALAPENO) || defined(SERRANO) + if (afsr & (C_AFSR_CPC | C_AFSR_CPU | C_AFSR_EDC | C_AFSR_WDC)) { +#else /* JALAPENO || SERRANO */ + if (afsr_errs & (C_AFSR_CPC | C_AFSR_EDC | C_AFSR_WDC | C_AFSR_L3_CPC | + C_AFSR_L3_EDC | C_AFSR_L3_WDC)) { +#endif /* JALAPENO || SERRANO */ + return (ECACHE_FLUSH_ALL); + } + +#if defined(JALAPENO) || defined(SERRANO) + /* + * If only UE or RUE is set, flush the Ecache line, otherwise + * flush the entire Ecache. + */ + if (afsr & (C_AFSR_UE|C_AFSR_RUE)) { + if ((afsr & C_AFSR_ALL_ERRS) == C_AFSR_UE || + (afsr & C_AFSR_ALL_ERRS) == C_AFSR_RUE) { + return (ECACHE_FLUSH_LINE); + } else { + return (ECACHE_FLUSH_ALL); + } + } +#else /* JALAPENO || SERRANO */ + /* + * If UE only is set, flush the Ecache line, otherwise + * flush the entire Ecache. + */ + if (afsr_errs & C_AFSR_UE) { + if ((afsr_errs & (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == + C_AFSR_UE) { + return (ECACHE_FLUSH_LINE); + } else { + return (ECACHE_FLUSH_ALL); + } + } +#endif /* JALAPENO || SERRANO */ + + /* + * EDU: If EDU only is set, flush the ecache line, otherwise + * flush the entire Ecache. 
+ */ + if (afsr_errs & (C_AFSR_EDU | C_AFSR_L3_EDU)) { + if (((afsr_errs & ~C_AFSR_EDU) == 0) || + ((afsr_errs & ~C_AFSR_L3_EDU) == 0)) { + return (ECACHE_FLUSH_LINE); + } else { + return (ECACHE_FLUSH_ALL); + } + } + + /* + * BERR: If BERR only is set, flush the Ecache line, otherwise + * flush the entire Ecache. + */ + if (afsr_errs & C_AFSR_BERR) { + if ((afsr_errs & ~C_AFSR_BERR) == 0) { + return (ECACHE_FLUSH_LINE); + } else { + return (ECACHE_FLUSH_ALL); + } + } + + return (0); +} + +void +cpu_error_ecache_flush(ch_async_flt_t *ch_flt) +{ + int ecache_flush_flag = + cpu_error_ecache_flush_required(ch_flt); + + /* + * Flush Ecache line or entire Ecache based on above checks. + */ + if (ecache_flush_flag == ECACHE_FLUSH_ALL) + cpu_flush_ecache(); + else if (ecache_flush_flag == ECACHE_FLUSH_LINE) { + cpu_flush_ecache_line(ch_flt); + } + +} + +/* + * Extract the PA portion from the E$ tag. + */ +uint64_t +cpu_ectag_to_pa(int setsize, uint64_t tag) +{ + if (IS_JAGUAR(cpunodes[CPU->cpu_id].implementation)) + return (JG_ECTAG_TO_PA(setsize, tag)); + else if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) + return (PN_L3TAG_TO_PA(tag)); + else + return (CH_ECTAG_TO_PA(setsize, tag)); +} + +/* + * Convert the E$ tag PA into an E$ subblock index. + */ +static int +cpu_ectag_pa_to_subblk(int cachesize, uint64_t subaddr) +{ + if (IS_JAGUAR(cpunodes[CPU->cpu_id].implementation)) + return (JG_ECTAG_PA_TO_SUBBLK(cachesize, subaddr)); + else if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) + /* Panther has only one subblock per line */ + return (0); + else + return (CH_ECTAG_PA_TO_SUBBLK(cachesize, subaddr)); +} + +/* + * All subblocks in an E$ line must be invalid for + * the line to be invalid. + */ +int +cpu_ectag_line_invalid(int cachesize, uint64_t tag) +{ + if (IS_JAGUAR(cpunodes[CPU->cpu_id].implementation)) + return (JG_ECTAG_LINE_INVALID(cachesize, tag)); + else if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) + return (PN_L3_LINE_INVALID(tag)); + else + return (CH_ECTAG_LINE_INVALID(cachesize, tag)); +} + +/* + * Extract state bits for a subblock given the tag. Note that for Panther + * this works on both l2 and l3 tags. + */ +static int +cpu_ectag_pa_to_subblk_state(int cachesize, uint64_t subaddr, uint64_t tag) +{ + if (IS_JAGUAR(cpunodes[CPU->cpu_id].implementation)) + return (JG_ECTAG_PA_TO_SUBBLK_STATE(cachesize, subaddr, tag)); + else if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) + return (tag & CH_ECSTATE_MASK); + else + return (CH_ECTAG_PA_TO_SUBBLK_STATE(cachesize, subaddr, tag)); +} + +/* + * Cpu specific initialization. + */ +void +cpu_mp_init(void) +{ +#ifdef CHEETAHPLUS_ERRATUM_25 + if (cheetah_sendmondo_recover) { + cheetah_nudge_init(); + } +#endif +} + +void +cpu_ereport_post(struct async_flt *aflt) +{ + char *cpu_type, buf[FM_MAX_CLASS]; + nv_alloc_t *nva = NULL; + nvlist_t *ereport, *detector, *resource; + errorq_elem_t *eqep; + ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt; + char unum[UNUM_NAMLEN]; + int len = 0; + uint8_t msg_type; + plat_ecc_ch_async_flt_t plat_ecc_ch_flt; + + if (aflt->flt_panic || panicstr) { + eqep = errorq_reserve(ereport_errorq); + if (eqep == NULL) + return; + ereport = errorq_elem_nvl(ereport_errorq, eqep); + nva = errorq_elem_nva(ereport_errorq, eqep); + } else { + ereport = fm_nvlist_create(nva); + } + + /* + * Create the scheme "cpu" FMRI. 
+ */ + detector = fm_nvlist_create(nva); + resource = fm_nvlist_create(nva); + switch (cpunodes[aflt->flt_inst].implementation) { + case CHEETAH_IMPL: + cpu_type = FM_EREPORT_CPU_USIII; + break; + case CHEETAH_PLUS_IMPL: + cpu_type = FM_EREPORT_CPU_USIIIplus; + break; + case JALAPENO_IMPL: + cpu_type = FM_EREPORT_CPU_USIIIi; + break; + case SERRANO_IMPL: + cpu_type = FM_EREPORT_CPU_USIIIiplus; + break; + case JAGUAR_IMPL: + cpu_type = FM_EREPORT_CPU_USIV; + break; + case PANTHER_IMPL: + cpu_type = FM_EREPORT_CPU_USIVplus; + break; + default: + cpu_type = FM_EREPORT_CPU_UNSUPPORTED; + break; + } + (void) fm_fmri_cpu_set(detector, FM_CPU_SCHEME_VERSION, NULL, + aflt->flt_inst, (uint8_t)cpunodes[aflt->flt_inst].version, + cpunodes[aflt->flt_inst].device_id); + + /* + * Encode all the common data into the ereport. + */ + (void) snprintf(buf, FM_MAX_CLASS, "%s.%s.%s", + FM_ERROR_CPU, cpu_type, aflt->flt_erpt_class); + + fm_ereport_set(ereport, FM_EREPORT_VERSION, buf, + fm_ena_generate_cpu(aflt->flt_id, aflt->flt_inst, FM_ENA_FMT1), + detector, NULL); + + /* + * Encode the error specific data that was saved in + * the async_flt structure into the ereport. + */ + cpu_payload_add_aflt(aflt, ereport, resource, + &plat_ecc_ch_flt.ecaf_afar_status, + &plat_ecc_ch_flt.ecaf_synd_status); + + if (aflt->flt_panic || panicstr) { + errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC); + } else { + (void) fm_ereport_post(ereport, EVCH_TRYHARD); + fm_nvlist_destroy(ereport, FM_NVA_FREE); + fm_nvlist_destroy(detector, FM_NVA_FREE); + fm_nvlist_destroy(resource, FM_NVA_FREE); + } + /* + * Send the enhanced error information (plat_ecc_error2_data_t) + * to the SC olny if it can process it. + */ + + if (&plat_ecc_capability_sc_get && + plat_ecc_capability_sc_get(PLAT_ECC_ERROR2_MESSAGE)) { + msg_type = cpu_flt_bit_to_plat_error(aflt); + if (msg_type != PLAT_ECC_ERROR2_NONE) { + /* + * If afar status is not invalid do a unum lookup. + */ + if (plat_ecc_ch_flt.ecaf_afar_status != + AFLT_STAT_INVALID) { + (void) cpu_get_mem_unum_aflt( + plat_ecc_ch_flt.ecaf_synd_status, aflt, + unum, UNUM_NAMLEN, &len); + } else { + unum[0] = '\0'; + } + plat_ecc_ch_flt.ecaf_sdw_afar = ch_flt->flt_sdw_afar; + plat_ecc_ch_flt.ecaf_sdw_afsr = ch_flt->flt_sdw_afsr; + plat_ecc_ch_flt.ecaf_afsr_ext = ch_flt->afsr_ext; + plat_ecc_ch_flt.ecaf_sdw_afsr_ext = + ch_flt->flt_sdw_afsr_ext; + + if (&plat_log_fruid_error2) + plat_log_fruid_error2(msg_type, unum, aflt, + &plat_ecc_ch_flt); + } + } +} + +void +cpu_run_bus_error_handlers(struct async_flt *aflt, int expected) +{ + int status; + ddi_fm_error_t de; + + bzero(&de, sizeof (ddi_fm_error_t)); + + de.fme_version = DDI_FME_VERSION; + de.fme_ena = fm_ena_generate_cpu(aflt->flt_id, aflt->flt_inst, + FM_ENA_FMT1); + de.fme_flag = expected; + de.fme_bus_specific = (void *)aflt->flt_addr; + status = ndi_fm_handler_dispatch(ddi_root_node(), NULL, &de); + if ((aflt->flt_prot == AFLT_PROT_NONE) && (status == DDI_FM_FATAL)) + aflt->flt_panic = 1; +} + +void +cpu_errorq_dispatch(char *error_class, void *payload, size_t payload_sz, + errorq_t *eqp, uint_t flag) +{ + struct async_flt *aflt = (struct async_flt *)payload; + + aflt->flt_erpt_class = error_class; + errorq_dispatch(eqp, payload, payload_sz, flag); +} + +/* + * This routine may be called by the IO module, but does not do + * anything in this cpu module. The SERD algorithm is handled by + * cpumem-diagnosis engine instead. 
+ */ +/*ARGSUSED*/ +void +cpu_ce_count_unum(struct async_flt *ecc, int len, char *unum) +{} + +void +adjust_hw_copy_limits(int ecache_size) +{ + /* + * Set hw copy limits. + * + * /etc/system will be parsed later and can override one or more + * of these settings. + * + * At this time, ecache size seems only mildly relevant. + * We seem to run into issues with the d-cache and stalls + * we see on misses. + * + * Cycle measurement indicates that 2 byte aligned copies fare + * little better than doing things with VIS at around 512 bytes. + * 4 byte aligned shows promise until around 1024 bytes. 8 Byte + * aligned is faster whenever the source and destination data + * in cache and the total size is less than 2 Kbytes. The 2K + * limit seems to be driven by the 2K write cache. + * When more than 2K of copies are done in non-VIS mode, stores + * backup in the write cache. In VIS mode, the write cache is + * bypassed, allowing faster cache-line writes aligned on cache + * boundaries. + * + * In addition, in non-VIS mode, there is no prefetching, so + * for larger copies, the advantage of prefetching to avoid even + * occasional cache misses is enough to justify using the VIS code. + * + * During testing, it was discovered that netbench ran 3% slower + * when hw_copy_limit_8 was 2K or larger. Apparently for server + * applications, data is only used once (copied to the output + * buffer, then copied by the network device off the system). Using + * the VIS copy saves more L2 cache state. Network copies are + * around 1.3K to 1.5K in size for historical reasons. + * + * Therefore, a limit of 1K bytes will be used for the 8 byte + * aligned copy even for large caches and 8 MB ecache. The + * infrastructure to allow different limits for different sized + * caches is kept to allow further tuning in later releases. + */ + + if (min_ecache_size == 0 && use_hw_bcopy) { + /* + * First time through - should be before /etc/system + * is read. + * Could skip the checks for zero but this lets us + * preserve any debugger rewrites. + */ + if (hw_copy_limit_1 == 0) { + hw_copy_limit_1 = VIS_COPY_THRESHOLD; + priv_hcl_1 = hw_copy_limit_1; + } + if (hw_copy_limit_2 == 0) { + hw_copy_limit_2 = 2 * VIS_COPY_THRESHOLD; + priv_hcl_2 = hw_copy_limit_2; + } + if (hw_copy_limit_4 == 0) { + hw_copy_limit_4 = 4 * VIS_COPY_THRESHOLD; + priv_hcl_4 = hw_copy_limit_4; + } + if (hw_copy_limit_8 == 0) { + hw_copy_limit_8 = 4 * VIS_COPY_THRESHOLD; + priv_hcl_8 = hw_copy_limit_8; + } + min_ecache_size = ecache_size; + } else { + /* + * MP initialization. Called *after* /etc/system has + * been parsed. One CPU has already been initialized. + * Need to cater for /etc/system having scragged one + * of our values. + */ + if (ecache_size == min_ecache_size) { + /* + * Same size ecache. We do nothing unless we + * have a pessimistic ecache setting. In that + * case we become more optimistic (if the cache is + * large enough). + */ + if (hw_copy_limit_8 == 4 * VIS_COPY_THRESHOLD) { + /* + * Need to adjust hw_copy_limit* from our + * pessimistic uniprocessor value to a more + * optimistic UP value *iff* it hasn't been + * reset. + */ + if ((ecache_size > 1048576) && + (priv_hcl_8 == hw_copy_limit_8)) { + if (ecache_size <= 2097152) + hw_copy_limit_8 = 4 * + VIS_COPY_THRESHOLD; + else if (ecache_size <= 4194304) + hw_copy_limit_8 = 4 * + VIS_COPY_THRESHOLD; + else + hw_copy_limit_8 = 4 * + VIS_COPY_THRESHOLD; + priv_hcl_8 = hw_copy_limit_8; + } + } + } else if (ecache_size < min_ecache_size) { + /* + * A different ecache size. 
Can this even happen? + */ + if (priv_hcl_8 == hw_copy_limit_8) { + /* + * The previous value that we set + * is unchanged (i.e., it hasn't been + * scragged by /etc/system). Rewrite it. + */ + if (ecache_size <= 1048576) + hw_copy_limit_8 = 8 * + VIS_COPY_THRESHOLD; + else if (ecache_size <= 2097152) + hw_copy_limit_8 = 8 * + VIS_COPY_THRESHOLD; + else if (ecache_size <= 4194304) + hw_copy_limit_8 = 8 * + VIS_COPY_THRESHOLD; + else + hw_copy_limit_8 = 10 * + VIS_COPY_THRESHOLD; + priv_hcl_8 = hw_copy_limit_8; + min_ecache_size = ecache_size; + } + } + } +} + +/* + * Called from illegal instruction trap handler to see if we can attribute + * the trap to a fpras check. + */ +int +fpras_chktrap(struct regs *rp) +{ + int op; + struct fpras_chkfngrp *cgp; + uintptr_t tpc = (uintptr_t)rp->r_pc; + + if (fpras_chkfngrps == NULL) + return (0); + + cgp = &fpras_chkfngrps[CPU->cpu_id]; + for (op = 0; op < FPRAS_NCOPYOPS; ++op) { + if (tpc >= (uintptr_t)&cgp->fpras_fn[op].fpras_blk0 && + tpc < (uintptr_t)&cgp->fpras_fn[op].fpras_chkresult) + break; + } + if (op == FPRAS_NCOPYOPS) + return (0); + + /* + * This is an fpRAS failure caught through an illegal + * instruction - trampoline. + */ + rp->r_pc = (uintptr_t)&cgp->fpras_fn[op].fpras_trampoline; + rp->r_npc = rp->r_pc + 4; + return (1); +} + +/* + * fpras_failure is called when a fpras check detects a bad calculation + * result or an illegal instruction trap is attributed to an fpras + * check. In all cases we are still bound to CPU. + */ +int +fpras_failure(int op, int how) +{ + int use_hw_bcopy_orig, use_hw_bzero_orig; + uint_t hcl1_orig, hcl2_orig, hcl4_orig, hcl8_orig; + ch_async_flt_t ch_flt; + struct async_flt *aflt = (struct async_flt *)&ch_flt; + struct fpras_chkfn *sfp, *cfp; + uint32_t *sip, *cip; + int i; + + /* + * We're running on a sick CPU. Avoid further FPU use at least for + * the time in which we dispatch an ereport and (if applicable) panic. + */ + use_hw_bcopy_orig = use_hw_bcopy; + use_hw_bzero_orig = use_hw_bzero; + hcl1_orig = hw_copy_limit_1; + hcl2_orig = hw_copy_limit_2; + hcl4_orig = hw_copy_limit_4; + hcl8_orig = hw_copy_limit_8; + use_hw_bcopy = use_hw_bzero = 0; + hw_copy_limit_1 = hw_copy_limit_2 = hw_copy_limit_4 = + hw_copy_limit_8 = 0; + + bzero(&ch_flt, sizeof (ch_async_flt_t)); + aflt->flt_id = gethrtime_waitfree(); + aflt->flt_class = CPU_FAULT; + aflt->flt_inst = CPU->cpu_id; + aflt->flt_status = (how << 8) | op; + aflt->flt_payload = FM_EREPORT_PAYLOAD_FPU_HWCOPY; + ch_flt.flt_type = CPU_FPUERR; + + /* + * We must panic if the copy operation had no lofault protection - + * ie, don't panic for copyin, copyout, kcopy and bcopy called + * under on_fault and do panic for unprotected bcopy and hwblkpagecopy. + */ + aflt->flt_panic = (curthread->t_lofault == NULL); + + /* + * XOR the source instruction block with the copied instruction + * block - this will show us which bit(s) are corrupted. 
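+ * For example (made-up values): if a word of the reference block is
+ * 0x81a01a60 and the copied block reads back 0x81a01e60, the XOR
+ * recorded in flt_fpdata is 0x00000400, i.e. bit 10 was corrupted.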
+ */ + sfp = (struct fpras_chkfn *)fpras_chkfn_type1; + cfp = &fpras_chkfngrps[CPU->cpu_id].fpras_fn[op]; + if (op == FPRAS_BCOPY || op == FPRAS_COPYOUT) { + sip = &sfp->fpras_blk0[0]; + cip = &cfp->fpras_blk0[0]; + } else { + sip = &sfp->fpras_blk1[0]; + cip = &cfp->fpras_blk1[0]; + } + for (i = 0; i < 16; ++i, ++sip, ++cip) + ch_flt.flt_fpdata[i] = *sip ^ *cip; + + cpu_errorq_dispatch(FM_EREPORT_CPU_USIII_FPU_HWCOPY, (void *)&ch_flt, + sizeof (ch_async_flt_t), ue_queue, aflt->flt_panic); + + if (aflt->flt_panic) + fm_panic("FPU failure on CPU %d", CPU->cpu_id); + + /* + * We get here for copyin/copyout and kcopy or bcopy where the + * caller has used on_fault. We will flag the error so that + * the process may be killed The trap_async_hwerr mechanism will + * take appropriate further action (such as a reboot, contract + * notification etc). Since we may be continuing we will + * restore the global hardware copy acceleration switches. + * + * When we return from this function to the copy function we want to + * avoid potentially bad data being used, ie we want the affected + * copy function to return an error. The caller should therefore + * invoke its lofault handler (which always exists for these functions) + * which will return the appropriate error. + */ + ttolwp(curthread)->lwp_pcb.pcb_flags |= ASYNC_HWERR; + aston(curthread); + + use_hw_bcopy = use_hw_bcopy_orig; + use_hw_bzero = use_hw_bzero_orig; + hw_copy_limit_1 = hcl1_orig; + hw_copy_limit_2 = hcl2_orig; + hw_copy_limit_4 = hcl4_orig; + hw_copy_limit_8 = hcl8_orig; + + return (1); +} + +#define VIS_BLOCKSIZE 64 + +int +dtrace_blksuword32_err(uintptr_t addr, uint32_t *data) +{ + int ret, watched; + + watched = watch_disable_addr((void *)addr, VIS_BLOCKSIZE, S_WRITE); + ret = dtrace_blksuword32(addr, data, 0); + if (watched) + watch_enable_addr((void *)addr, VIS_BLOCKSIZE, S_WRITE); + + return (ret); +} + +/* + * Called when a cpu enters the CPU_FAULTED state (by the cpu placing the + * faulted cpu into that state). Cross-trap to the faulted cpu to clear + * CEEN from the EER to disable traps for further disrupting error types + * on that cpu. We could cross-call instead, but that has a larger + * instruction and data footprint than cross-trapping, and the cpu is known + * to be faulted. + */ + +void +cpu_faulted_enter(struct cpu *cp) +{ + xt_one(cp->cpu_id, set_error_enable_tl1, EN_REG_CEEN, EER_SET_CLRBITS); +} + +/* + * Called when a cpu leaves the CPU_FAULTED state to return to one of + * offline, spare, or online (by the cpu requesting this state change). + * First we cross-call to clear the AFSR (and AFSR_EXT on Panther) of + * disrupting error bits that have accumulated without trapping, then + * we cross-trap to re-enable CEEN controlled traps. + */ +void +cpu_faulted_exit(struct cpu *cp) +{ + ch_cpu_errors_t cpu_error_regs; + + cpu_error_regs.afsr = C_AFSR_CECC_ERRS; + if (IS_PANTHER(cpunodes[cp->cpu_id].implementation)) + cpu_error_regs.afsr_ext &= C_AFSR_EXT_CECC_ERRS; + xc_one(cp->cpu_id, (xcfunc_t *)set_cpu_error_state, + (uint64_t)&cpu_error_regs, 0); + + xt_one(cp->cpu_id, set_error_enable_tl1, EN_REG_CEEN, EER_SET_SETBITS); +} + +/* + * Return 1 if the errors in ch_flt's AFSR are secondary errors caused by + * the errors in the original AFSR, 0 otherwise. + * + * For all procs if the initial error was a BERR or TO, then it is possible + * that we may have caused a secondary BERR or TO in the process of logging the + * inital error via cpu_run_bus_error_handlers(). 
If this is the case then + * if the request was protected then a panic is still not necessary, if not + * protected then aft_panic is already set - so either way there's no need + * to set aft_panic for the secondary error. + * + * For Cheetah and Jalapeno, if the original error was a UE which occurred on + * a store merge, then the error handling code will call cpu_deferred_error(). + * When clear_errors() is called, it will determine that secondary errors have + * occurred - in particular, the store merge also caused a EDU and WDU that + * weren't discovered until this point. + * + * We do three checks to verify that we are in this case. If we pass all three + * checks, we return 1 to indicate that we should not panic. If any unexpected + * errors occur, we return 0. + * + * For Cheetah+ and derivative procs, the store merge causes a DUE, which is + * handled in cpu_disrupting_errors(). Since this function is not even called + * in the case we are interested in, we just return 0 for these processors. + */ +/*ARGSUSED*/ +static int +cpu_check_secondary_errors(ch_async_flt_t *ch_flt, uint64_t t_afsr_errs, + uint64_t t_afar) +{ +#if defined(CHEETAH_PLUS) +#else /* CHEETAH_PLUS */ + struct async_flt *aflt = (struct async_flt *)ch_flt; +#endif /* CHEETAH_PLUS */ + + /* + * Was the original error a BERR or TO and only a BERR or TO + * (multiple errors are also OK) + */ + if ((t_afsr_errs & ~(C_AFSR_BERR | C_AFSR_TO | C_AFSR_ME)) == 0) { + /* + * Is the new error a BERR or TO and only a BERR or TO + * (multiple errors are also OK) + */ + if ((ch_flt->afsr_errs & + ~(C_AFSR_BERR | C_AFSR_TO | C_AFSR_ME)) == 0) + return (1); + } + +#if defined(CHEETAH_PLUS) + return (0); +#else /* CHEETAH_PLUS */ + /* + * Now look for secondary effects of a UE on cheetah/jalapeno + * + * Check the original error was a UE, and only a UE. Note that + * the ME bit will cause us to fail this check. + */ + if (t_afsr_errs != C_AFSR_UE) + return (0); + + /* + * Check the secondary errors were exclusively an EDU and/or WDU. + */ + if ((ch_flt->afsr_errs & ~(C_AFSR_EDU|C_AFSR_WDU)) != 0) + return (0); + + /* + * Check the AFAR of the original error and secondary errors + * match to the 64-byte boundary + */ + if (P2ALIGN(aflt->flt_addr, 64) != P2ALIGN(t_afar, 64)) + return (0); + + /* + * We've passed all the checks, so it's a secondary error! + */ + return (1); +#endif /* CHEETAH_PLUS */ +} + +/* + * Translate the flt_bit or flt_type into an error type. First, flt_bit + * is checked for any valid errors. If found, the error type is + * returned. If not found, the flt_type is checked for L1$ parity errors. 
+ */ +/*ARGSUSED*/ +static uint8_t +cpu_flt_bit_to_plat_error(struct async_flt *aflt) +{ +#if defined(JALAPENO) + /* + * Currently, logging errors to the SC is not supported on Jalapeno + */ + return (PLAT_ECC_ERROR2_NONE); +#else + ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt; + + switch (ch_flt->flt_bit) { + case C_AFSR_CE: + return (PLAT_ECC_ERROR2_CE); + case C_AFSR_UCC: + case C_AFSR_EDC: + case C_AFSR_WDC: + case C_AFSR_CPC: + return (PLAT_ECC_ERROR2_L2_CE); + case C_AFSR_EMC: + return (PLAT_ECC_ERROR2_EMC); + case C_AFSR_IVC: + return (PLAT_ECC_ERROR2_IVC); + case C_AFSR_UE: + return (PLAT_ECC_ERROR2_UE); + case C_AFSR_UCU: + case C_AFSR_EDU: + case C_AFSR_WDU: + case C_AFSR_CPU: + return (PLAT_ECC_ERROR2_L2_UE); + case C_AFSR_IVU: + return (PLAT_ECC_ERROR2_IVU); + case C_AFSR_TO: + return (PLAT_ECC_ERROR2_TO); + case C_AFSR_BERR: + return (PLAT_ECC_ERROR2_BERR); +#if defined(CHEETAH_PLUS) + case C_AFSR_L3_EDC: + case C_AFSR_L3_UCC: + case C_AFSR_L3_CPC: + case C_AFSR_L3_WDC: + return (PLAT_ECC_ERROR2_L3_CE); + case C_AFSR_IMC: + return (PLAT_ECC_ERROR2_IMC); + case C_AFSR_TSCE: + return (PLAT_ECC_ERROR2_L2_TSCE); + case C_AFSR_THCE: + return (PLAT_ECC_ERROR2_L2_THCE); + case C_AFSR_L3_MECC: + return (PLAT_ECC_ERROR2_L3_MECC); + case C_AFSR_L3_THCE: + return (PLAT_ECC_ERROR2_L3_THCE); + case C_AFSR_L3_CPU: + case C_AFSR_L3_EDU: + case C_AFSR_L3_UCU: + case C_AFSR_L3_WDU: + return (PLAT_ECC_ERROR2_L3_UE); + case C_AFSR_DUE: + return (PLAT_ECC_ERROR2_DUE); + case C_AFSR_DTO: + return (PLAT_ECC_ERROR2_DTO); + case C_AFSR_DBERR: + return (PLAT_ECC_ERROR2_DBERR); +#endif /* CHEETAH_PLUS */ + default: + switch (ch_flt->flt_type) { +#if defined(CPU_IMP_L1_CACHE_PARITY) + case CPU_IC_PARITY: + return (PLAT_ECC_ERROR2_IPE); + case CPU_DC_PARITY: + if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) { + if (ch_flt->parity_data.dpe.cpl_cache == + CPU_PC_PARITY) { + return (PLAT_ECC_ERROR2_PCACHE); + } + } + return (PLAT_ECC_ERROR2_DPE); +#endif /* CPU_IMP_L1_CACHE_PARITY */ + case CPU_ITLB_PARITY: + return (PLAT_ECC_ERROR2_ITLB); + case CPU_DTLB_PARITY: + return (PLAT_ECC_ERROR2_DTLB); + default: + return (PLAT_ECC_ERROR2_NONE); + } + } +#endif /* JALAPENO */ +} diff --git a/usr/src/uts/sun4u/cpu/us3_common_asm.s b/usr/src/uts/sun4u/cpu/us3_common_asm.s new file mode 100644 index 0000000000..8acb0963b2 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/us3_common_asm.s @@ -0,0 +1,3242 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ * + * Assembly code support for Cheetah/Cheetah+ modules + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#if !defined(lint) +#include "assym.h" +#endif /* !lint */ + +#include <sys/asm_linkage.h> +#include <sys/mmu.h> +#include <vm/hat_sfmmu.h> +#include <sys/machparam.h> +#include <sys/machcpuvar.h> +#include <sys/machthread.h> +#include <sys/machtrap.h> +#include <sys/privregs.h> +#include <sys/trap.h> +#include <sys/cheetahregs.h> +#include <sys/us3_module.h> +#include <sys/xc_impl.h> +#include <sys/intreg.h> +#include <sys/async.h> +#include <sys/clock.h> +#include <sys/cheetahasm.h> +#include <sys/cmpregs.h> + +#ifdef TRAPTRACE +#include <sys/traptrace.h> +#endif /* TRAPTRACE */ + +#if !defined(lint) + +/* BEGIN CSTYLED */ + +#define DCACHE_FLUSHPAGE(arg1, arg2, tmp1, tmp2, tmp3) \ + ldxa [%g0]ASI_DCU, tmp1 ;\ + btst DCU_DC, tmp1 /* is dcache enabled? */ ;\ + bz,pn %icc, 1f ;\ + ASM_LD(tmp1, dcache_linesize) ;\ + ASM_LD(tmp2, dflush_type) ;\ + cmp tmp2, FLUSHPAGE_TYPE ;\ + be,pt %icc, 2f ;\ + nop ;\ + sllx arg1, CHEETAH_DC_VBIT_SHIFT, arg1/* tag to compare */ ;\ + ASM_LD(tmp3, dcache_size) ;\ + cmp tmp2, FLUSHMATCH_TYPE ;\ + be,pt %icc, 3f ;\ + nop ;\ + /* \ + * flushtype = FLUSHALL_TYPE, flush the whole thing \ + * tmp3 = cache size \ + * tmp1 = cache line size \ + */ \ + sub tmp3, tmp1, tmp2 ;\ +4: \ + stxa %g0, [tmp2]ASI_DC_TAG ;\ + membar #Sync ;\ + cmp %g0, tmp2 ;\ + bne,pt %icc, 4b ;\ + sub tmp2, tmp1, tmp2 ;\ + ba,pt %icc, 1f ;\ + nop ;\ + /* \ + * flushtype = FLUSHPAGE_TYPE \ + * arg1 = pfn \ + * arg2 = virtual color \ + * tmp1 = cache line size \ + * tmp2 = tag from cache \ + * tmp3 = counter \ + */ \ +2: \ + set MMU_PAGESIZE, tmp3 ;\ + sllx arg1, MMU_PAGESHIFT, arg1 /* pfn to 43 bit PA */ ;\ + sub tmp3, tmp1, tmp3 ;\ +4: \ + stxa %g0, [arg1 + tmp3]ASI_DC_INVAL ;\ + membar #Sync ;\ +5: \ + cmp %g0, tmp3 ;\ + bnz,pt %icc, 4b /* branch if not done */ ;\ + sub tmp3, tmp1, tmp3 ;\ + ba,pt %icc, 1f ;\ + nop ;\ + /* \ + * flushtype = FLUSHMATCH_TYPE \ + * arg1 = tag to compare against \ + * tmp1 = cache line size \ + * tmp3 = cache size \ + * arg2 = counter \ + * tmp2 = cache tag \ + */ \ +3: \ + sub tmp3, tmp1, arg2 ;\ +4: \ + ldxa [arg2]ASI_DC_TAG, tmp2 /* read tag */ ;\ + btst CHEETAH_DC_VBIT_MASK, tmp2 ;\ + bz,pn %icc, 5f /* br if no valid sub-blocks */ ;\ + andn tmp2, CHEETAH_DC_VBIT_MASK, tmp2 /* clear out v bits */ ;\ + cmp tmp2, arg1 ;\ + bne,pn %icc, 5f /* branch if tag miss */ ;\ + nop ;\ + stxa %g0, [arg2]ASI_DC_TAG ;\ + membar #Sync ;\ +5: \ + cmp %g0, arg2 ;\ + bne,pt %icc, 4b /* branch if not done */ ;\ + sub arg2, tmp1, arg2 ;\ +1: + + +/* END CSTYLED */ + +#endif /* !lint */ + +/* + * Cheetah MMU and Cache operations. + */ + +#if defined(lint) + +/* ARGSUSED */ +void +vtag_flushpage(caddr_t vaddr, u_int ctxnum) +{} + +#else /* lint */ + + ENTRY_NP(vtag_flushpage) + /* + * flush page from the tlb + * + * %o0 = vaddr + * %o1 = ctxnum + */ + rdpr %pstate, %o5 +#ifdef DEBUG + andcc %o5, PSTATE_IE, %g0 /* if interrupts already */ + bnz,a,pt %icc, 3f /* disabled, panic */ + nop + save %sp, -SA(MINFRAME), %sp + sethi %hi(sfmmu_panic1), %o0 + call panic + or %o0, %lo(sfmmu_panic1), %o0 + ret + restore +3: +#endif /* DEBUG */ + /* + * disable ints + */ + andn %o5, PSTATE_IE, %o4 + wrpr %o4, 0, %pstate + + /* + * Then, blow out the tlb + * Interrupts are disabled to prevent the primary ctx register + * from changing underneath us. + */ + brnz,pt %o1, 1f /* KCONTEXT */ + sethi %hi(FLUSH_ADDR), %o3 + /* + * For KCONTEXT demaps use primary. 
type = page implicitly + */ + stxa %g0, [%o0]ASI_DTLB_DEMAP /* dmmu flush for KCONTEXT */ + stxa %g0, [%o0]ASI_ITLB_DEMAP /* immu flush for KCONTEXT */ + flush %o3 + b 5f + nop +1: + /* + * User demap. We need to set the primary context properly. + * Secondary context cannot be used for Cheetah IMMU. + * %o0 = vaddr + * %o1 = ctxnum + * %o3 = FLUSH_ADDR + */ + sethi %hi(ctx_pgsz_array), %o4 + ldn [%o4 + %lo(ctx_pgsz_array)], %o4 + brz %o4, 2f + nop + ldub [%o4 + %o1], %o4 + sll %o4, CTXREG_EXT_SHIFT, %o4 + or %o1, %o4, %o1 +2: + wrpr %g0, 1, %tl + set MMU_PCONTEXT, %o4 + or DEMAP_PRIMARY | DEMAP_PAGE_TYPE, %o0, %o0 + ldxa [%o4]ASI_DMMU, %o2 /* rd old ctxnum */ + stxa %o1, [%o4]ASI_DMMU /* wr new ctxum */ +4: + stxa %g0, [%o0]ASI_DTLB_DEMAP + stxa %g0, [%o0]ASI_ITLB_DEMAP + stxa %o2, [%o4]ASI_DMMU /* restore old ctxnum */ + flush %o3 + wrpr %g0, 0, %tl +5: + retl + wrpr %g0, %o5, %pstate /* enable interrupts */ + SET_SIZE(vtag_flushpage) + +#endif /* lint */ + + +#if defined(lint) + +/* ARGSUSED */ +void +vtag_flushctx(u_int ctxnum) +{} + +#else /* lint */ + + ENTRY_NP(vtag_flushctx) + /* + * flush context from the tlb + * + * %o0 = ctxnum + * We disable interrupts to prevent the primary ctx register changing + * underneath us. + */ + sethi %hi(FLUSH_ADDR), %o3 + rdpr %pstate, %o2 + +#ifdef DEBUG + andcc %o2, PSTATE_IE, %g0 /* if interrupts already */ + bnz,a,pt %icc, 1f /* disabled, panic */ + nop + sethi %hi(sfmmu_panic1), %o0 + call panic + or %o0, %lo(sfmmu_panic1), %o0 +1: +#endif /* DEBUG */ + + sethi %hi(ctx_pgsz_array), %o4 + ldn [%o4 + %lo(ctx_pgsz_array)], %o4 + brz %o4, 2f + nop + ldub [%o4 + %o0], %o4 + sll %o4, CTXREG_EXT_SHIFT, %o4 + or %o0, %o4, %o0 +2: + wrpr %o2, PSTATE_IE, %pstate /* disable interrupts */ + set MMU_PCONTEXT, %o4 + set DEMAP_CTX_TYPE | DEMAP_PRIMARY, %g1 + wrpr %g0, 1, %tl + ldxa [%o4]ASI_DMMU, %o5 /* rd old ctxnum */ + stxa %o0, [%o4]ASI_DMMU /* wr new ctxum */ +4: + stxa %g0, [%g1]ASI_DTLB_DEMAP + stxa %g0, [%g1]ASI_ITLB_DEMAP + stxa %o5, [%o4]ASI_DMMU /* restore old ctxnum */ + flush %o3 + wrpr %g0, 0, %tl +5: + retl + wrpr %g0, %o2, %pstate /* enable interrupts */ + SET_SIZE(vtag_flushctx) + +#endif /* lint */ + + +#if defined(lint) + +void +vtag_flushall(void) +{} + +#else /* lint */ + + ENTRY_NP2(vtag_flushall, demap_all) + /* + * flush the tlb + */ + sethi %hi(FLUSH_ADDR), %o3 + set DEMAP_ALL_TYPE, %g1 + stxa %g0, [%g1]ASI_DTLB_DEMAP + stxa %g0, [%g1]ASI_ITLB_DEMAP + flush %o3 + retl + nop + SET_SIZE(demap_all) + SET_SIZE(vtag_flushall) + +#endif /* lint */ + + +#if defined(lint) + +/* ARGSUSED */ +void +vtag_flushpage_tl1(uint64_t vaddr, uint64_t ctxnum) +{} + +#else /* lint */ + + ENTRY_NP(vtag_flushpage_tl1) + /* + * x-trap to flush page from tlb and tsb + * + * %g1 = vaddr, zero-extended on 32-bit kernel + * %g2 = ctxnum + * + * assumes TSBE_TAG = 0 + */ + srln %g1, MMU_PAGESHIFT, %g1 + brnz,pt %g2, 1f /* KCONTEXT */ + slln %g1, MMU_PAGESHIFT, %g1 /* g1 = vaddr */ + + /* We need to demap in the kernel context */ + or DEMAP_NUCLEUS | DEMAP_PAGE_TYPE, %g1, %g1 + stxa %g0, [%g1]ASI_DTLB_DEMAP + stxa %g0, [%g1]ASI_ITLB_DEMAP + retry +1: + /* We need to demap in a user context */ + or DEMAP_PRIMARY | DEMAP_PAGE_TYPE, %g1, %g1 + sethi %hi(ctx_pgsz_array), %g4 + ldn [%g4 + %lo(ctx_pgsz_array)], %g4 + brz %g4, 2f + nop + ldub [%g4 + %g2], %g4 + sll %g4, CTXREG_EXT_SHIFT, %g4 + or %g2, %g4, %g2 +2: + set MMU_PCONTEXT, %g4 + ldxa [%g4]ASI_DMMU, %g5 /* rd old ctxnum */ + stxa %g2, [%g4]ASI_DMMU /* wr new ctxum */ + stxa %g0, [%g1]ASI_DTLB_DEMAP + stxa %g0, 
[%g1]ASI_ITLB_DEMAP + stxa %g5, [%g4]ASI_DMMU /* restore old ctxnum */ + retry + SET_SIZE(vtag_flushpage_tl1) + +#endif /* lint */ + + +#if defined(lint) + +/* ARGSUSED */ +void +vtag_flush_pgcnt_tl1(uint64_t vaddr, uint64_t ctx_pgcnt) +{} + +#else /* lint */ + + ENTRY_NP(vtag_flush_pgcnt_tl1) + /* + * x-trap to flush pgcnt MMU_PAGESIZE pages from tlb + * + * %g1 = vaddr, zero-extended on 32-bit kernel + * %g2 = <zero32|ctx16|pgcnt16> + * + * NOTE: this handler relies on the fact that no + * interrupts or traps can occur during the loop + * issuing the TLB_DEMAP operations. It is assumed + * that interrupts are disabled and this code is + * fetching from the kernel locked text address. + * + * assumes TSBE_TAG = 0 + */ + set 0xffff, %g4 + and %g4, %g2, %g3 /* g3 = pgcnt */ + srln %g2, 16, %g2 /* g2 = ctxnum */ + srln %g1, MMU_PAGESHIFT, %g1 + brnz,pt %g2, 1f /* KCONTEXT? */ + slln %g1, MMU_PAGESHIFT, %g1 /* g1 = vaddr */ + + /* We need to demap in the kernel context */ + or DEMAP_NUCLEUS | DEMAP_PAGE_TYPE, %g1, %g1 + set MMU_PAGESIZE, %g2 /* g2 = pgsize */ +4: + stxa %g0, [%g1]ASI_DTLB_DEMAP + stxa %g0, [%g1]ASI_ITLB_DEMAP + deccc %g3 /* decr pgcnt */ + bnz,pt %icc,4b + add %g1, %g2, %g1 /* next page */ + retry +1: + /* We need to demap in a user context */ + sethi %hi(ctx_pgsz_array), %g4 + ldn [%g4 + %lo(ctx_pgsz_array)], %g4 + brz %g4, 2f + or DEMAP_PRIMARY | DEMAP_PAGE_TYPE, %g1, %g1 + ldub [%g4 + %g2], %g4 + sll %g4, CTXREG_EXT_SHIFT, %g4 + or %g2, %g4, %g2 +2: + set MMU_PCONTEXT, %g4 + ldxa [%g4]ASI_DMMU, %g5 /* rd old ctxnum */ + stxa %g2, [%g4]ASI_DMMU /* wr new ctxum */ + + set MMU_PAGESIZE, %g2 /* g2 = pgsize */ +3: + stxa %g0, [%g1]ASI_DTLB_DEMAP + stxa %g0, [%g1]ASI_ITLB_DEMAP + deccc %g3 /* decr pgcnt */ + bnz,pt %icc,3b + add %g1, %g2, %g1 /* next page */ + + stxa %g5, [%g4]ASI_DMMU /* restore old ctxnum */ + retry + SET_SIZE(vtag_flush_pgcnt_tl1) + +#endif /* lint */ + + +#if defined(lint) + +/* ARGSUSED */ +void +vtag_flushctx_tl1(uint64_t ctxnum, uint64_t dummy) +{} + +#else /* lint */ + + ENTRY_NP(vtag_flushctx_tl1) + /* + * x-trap to flush context from tlb + * + * %g1 = ctxnum + */ + sethi %hi(ctx_pgsz_array), %g4 + ldn [%g4 + %lo(ctx_pgsz_array)], %g4 + brz %g4, 2f + nop + ldub [%g4 + %g1], %g4 + sll %g4, CTXREG_EXT_SHIFT, %g4 + or %g1, %g4, %g1 +2: + set DEMAP_CTX_TYPE | DEMAP_PRIMARY, %g4 + set MMU_PCONTEXT, %g3 + ldxa [%g3]ASI_DMMU, %g5 /* rd old ctxnum */ + stxa %g1, [%g3]ASI_DMMU /* wr new ctxum */ + stxa %g0, [%g4]ASI_DTLB_DEMAP + stxa %g0, [%g4]ASI_ITLB_DEMAP + stxa %g5, [%g3]ASI_DMMU /* restore old ctxnum */ + retry + SET_SIZE(vtag_flushctx_tl1) + +#endif /* lint */ + + +#if defined(lint) + +/*ARGSUSED*/ +void +vtag_flushall_tl1(uint64_t dummy1, uint64_t dummy2) +{} + +#else /* lint */ + + ENTRY_NP(vtag_flushall_tl1) + /* + * x-trap to flush tlb + */ + set DEMAP_ALL_TYPE, %g4 + stxa %g0, [%g4]ASI_DTLB_DEMAP + stxa %g0, [%g4]ASI_ITLB_DEMAP + retry + SET_SIZE(vtag_flushall_tl1) + +#endif /* lint */ + + +#if defined(lint) + +/* ARGSUSED */ +void +vac_flushpage(pfn_t pfnum, int vcolor) +{} + +#else /* lint */ + +/* + * vac_flushpage(pfnum, color) + * Flush 1 8k page of the D-$ with physical page = pfnum + * Algorithm: + * The cheetah dcache is a 64k psuedo 4 way accaociative cache. + * It is virtual indexed, physically tagged cache. 
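+ *
+ * A minimal C model of the FLUSHPAGE_TYPE path in the macro used below
+ * may help: the pfn is shifted up to a physical address and every line
+ * of the 8K page is invalidated through the physical-address D$ ASI,
+ * which is why the cache's associativity never has to be considered.
+ * dc_inval_line() is a hypothetical stand-in for the stxa to
+ * ASI_DC_INVAL and the membar #Sync that follows it:
+ *
+ *	void
+ *	dcache_flush_page(uint64_t pfn, uint64_t linesize)
+ *	{
+ *		uint64_t pa = pfn << MMU_PAGESHIFT;
+ *		uint64_t off = MMU_PAGESIZE;
+ *
+ *		do {
+ *			off -= linesize;
+ *			dc_inval_line(pa + off);
+ *		} while (off != 0);
+ *	}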
+ */ + .seg ".data" + .align 8 + .global dflush_type +dflush_type: + .word FLUSHPAGE_TYPE + + ENTRY(vac_flushpage) + /* + * flush page from the d$ + * + * %o0 = pfnum, %o1 = color + */ + DCACHE_FLUSHPAGE(%o0, %o1, %o2, %o3, %o4) + retl + nop + SET_SIZE(vac_flushpage) + +#endif /* lint */ + + +#if defined(lint) + +/* ARGSUSED */ +void +vac_flushpage_tl1(uint64_t pfnum, uint64_t vcolor) +{} + +#else /* lint */ + + ENTRY_NP(vac_flushpage_tl1) + /* + * x-trap to flush page from the d$ + * + * %g1 = pfnum, %g2 = color + */ + DCACHE_FLUSHPAGE(%g1, %g2, %g3, %g4, %g5) + retry + SET_SIZE(vac_flushpage_tl1) + +#endif /* lint */ + + +#if defined(lint) + +/* ARGSUSED */ +void +vac_flushcolor(int vcolor, pfn_t pfnum) +{} + +#else /* lint */ + /* + * In UltraSPARC III flushcolor is same as as flushpage. + * This is because we have an ASI to flush dcache using physical + * address. + * Flushing dcache using physical address is faster because we + * don't have to deal with associativity of dcache. + * The arguments to vac_flushpage() and vac_flushcolor() are same but + * the order is reversed. this is because we maintain compatibility + * with spitfire, in which vac_flushcolor has only one argument, namely + * vcolor. + */ + + ENTRY(vac_flushcolor) + /* + * %o0 = vcolor, %o1 = pfnum + */ + DCACHE_FLUSHPAGE(%o1, %o0, %o2, %o3, %o4) + retl + nop + SET_SIZE(vac_flushcolor) + +#endif /* lint */ + + +#if defined(lint) + +/* ARGSUSED */ +void +vac_flushcolor_tl1(uint64_t vcolor, uint64_t pfnum) +{} + +#else /* lint */ + + ENTRY(vac_flushcolor_tl1) + /* + * %g1 = vcolor + * %g2 = pfnum + */ + DCACHE_FLUSHPAGE(%g2, %g1, %g3, %g4, %g5) + retry + SET_SIZE(vac_flushcolor_tl1) + +#endif /* lint */ + +#if defined(lint) + +int +idsr_busy(void) +{ + return (0); +} + +#else /* lint */ + +/* + * Determine whether or not the IDSR is busy. + * Entry: no arguments + * Returns: 1 if busy, 0 otherwise + */ + ENTRY(idsr_busy) + ldxa [%g0]ASI_INTR_DISPATCH_STATUS, %g1 + clr %o0 + btst IDSR_BUSY, %g1 + bz,a,pt %xcc, 1f + mov 1, %o0 +1: + retl + nop + SET_SIZE(idsr_busy) + +#endif /* lint */ + +#if defined(lint) + +/* ARGSUSED */ +void +init_mondo(xcfunc_t *func, uint64_t arg1, uint64_t arg2) +{} + +/* ARGSUSED */ +void +init_mondo_nocheck(xcfunc_t *func, uint64_t arg1, uint64_t arg2) +{} + +#else /* lint */ + + .global _dispatch_status_busy +_dispatch_status_busy: + .asciz "ASI_INTR_DISPATCH_STATUS error: busy" + .align 4 + +/* + * Setup interrupt dispatch data registers + * Entry: + * %o0 - function or inumber to call + * %o1, %o2 - arguments (2 uint64_t's) + */ + .seg "text" + + ENTRY(init_mondo) +#ifdef DEBUG + ! + ! IDSR should not be busy at the moment + ! + ldxa [%g0]ASI_INTR_DISPATCH_STATUS, %g1 + btst IDSR_BUSY, %g1 + bz,pt %xcc, 1f + nop + sethi %hi(_dispatch_status_busy), %o0 + call panic + or %o0, %lo(_dispatch_status_busy), %o0 +#endif /* DEBUG */ + + ALTENTRY(init_mondo_nocheck) + ! + ! interrupt vector dispatch data reg 0 + ! +1: + mov IDDR_0, %g1 + mov IDDR_1, %g2 + mov IDDR_2, %g3 + stxa %o0, [%g1]ASI_INTR_DISPATCH + + ! + ! interrupt vector dispatch data reg 1 + ! + stxa %o1, [%g2]ASI_INTR_DISPATCH + + ! + ! interrupt vector dispatch data reg 2 + ! 
+ stxa %o2, [%g3]ASI_INTR_DISPATCH + + membar #Sync + retl + nop + SET_SIZE(init_mondo_nocheck) + SET_SIZE(init_mondo) + +#endif /* lint */ + + +#if !(defined(JALAPENO) || defined(SERRANO)) + +#if defined(lint) + +/* ARGSUSED */ +void +shipit(int upaid, int bn) +{ return; } + +#else /* lint */ + +/* + * Ship mondo to aid using busy/nack pair bn + */ + ENTRY_NP(shipit) + sll %o0, IDCR_PID_SHIFT, %g1 ! IDCR<18:14> = agent id + sll %o1, IDCR_BN_SHIFT, %g2 ! IDCR<28:24> = b/n pair + or %g1, IDCR_OFFSET, %g1 ! IDCR<13:0> = 0x70 + or %g1, %g2, %g1 + stxa %g0, [%g1]ASI_INTR_DISPATCH ! interrupt vector dispatch + membar #Sync + retl + nop + SET_SIZE(shipit) + +#endif /* lint */ + +#endif /* !(JALAPENO || SERRANO) */ + + +#if defined(lint) + +/* ARGSUSED */ +void +flush_instr_mem(caddr_t vaddr, size_t len) +{} + +#else /* lint */ + +/* + * flush_instr_mem: + * Flush 1 page of the I-$ starting at vaddr + * %o0 vaddr + * %o1 bytes to be flushed + * UltraSPARC-III maintains consistency of the on-chip Instruction Cache with + * the stores from all processors so that a FLUSH instruction is only needed + * to ensure pipeline is consistent. This means a single flush is sufficient at + * the end of a sequence of stores that updates the instruction stream to + * ensure correct operation. + */ + + ENTRY(flush_instr_mem) + flush %o0 ! address irrelevent + retl + nop + SET_SIZE(flush_instr_mem) + +#endif /* lint */ + + +#if defined(CPU_IMP_ECACHE_ASSOC) + +#if defined(lint) + +/* ARGSUSED */ +uint64_t +get_ecache_ctrl(void) +{ return (0); } + +#else /* lint */ + + ENTRY(get_ecache_ctrl) + GET_CPU_IMPL(%o0) + cmp %o0, JAGUAR_IMPL + ! + ! Putting an ASI access in the delay slot may + ! cause it to be accessed, even when annulled. + ! + bne 1f + nop + ldxa [%g0]ASI_EC_CFG_TIMING, %o0 ! read Jaguar shared E$ ctrl reg + b 2f + nop +1: + ldxa [%g0]ASI_EC_CTRL, %o0 ! read Ch/Ch+ E$ control reg +2: + retl + nop + SET_SIZE(get_ecache_ctrl) + +#endif /* lint */ + +#endif /* CPU_IMP_ECACHE_ASSOC */ + + +#if !(defined(JALAPENO) || defined(SERRANO)) + +/* + * flush_ecache: + * %o0 - 64 bit physical address + * %o1 - ecache size + * %o2 - ecache linesize + */ +#if defined(lint) + +/*ARGSUSED*/ +void +flush_ecache(uint64_t physaddr, size_t ecache_size, size_t ecache_linesize) +{} + +#else /* !lint */ + + ENTRY(flush_ecache) + + /* + * For certain CPU implementations, we have to flush the L2 cache + * before flushing the ecache. + */ + PN_L2_FLUSHALL(%g3, %g4, %g5) + + /* + * Flush the entire Ecache using displacement flush. 
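+ *
+ * Displacement flushing has no single "flush everything" operation; it
+ * reads a physically contiguous flush region at least as large as the
+ * E$, one cache line at a time, so that every resident (possibly
+ * dirty) line is displaced and written back. Roughly, in C, where
+ * flushbase, size, linesize and load_phys() are placeholders for the
+ * real flush address and machine parameters:
+ *
+ *	for (off = 0; off < size; off += linesize)
+ *		(void) load_phys(flushbase + off);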
+ */ + ECACHE_FLUSHALL(%o1, %o2, %o0, %o4) + + retl + nop + SET_SIZE(flush_ecache) + +#endif /* lint */ + +#endif /* !(JALAPENO || SERRANO) */ + + +#if defined(lint) + +void +flush_dcache(void) +{} + +#else /* lint */ + + ENTRY(flush_dcache) + ASM_LD(%o0, dcache_size) + ASM_LD(%o1, dcache_linesize) + CH_DCACHE_FLUSHALL(%o0, %o1, %o2) + retl + nop + SET_SIZE(flush_dcache) + +#endif /* lint */ + + +#if defined(lint) + +void +flush_icache(void) +{} + +#else /* lint */ + + ENTRY(flush_icache) + GET_CPU_PRIVATE_PTR(%g0, %o0, %o2, flush_icache_1); + ld [%o0 + CHPR_ICACHE_LINESIZE], %o1 + ba,pt %icc, 2f + ld [%o0 + CHPR_ICACHE_SIZE], %o0 +flush_icache_1: + ASM_LD(%o0, icache_size) + ASM_LD(%o1, icache_linesize) +2: + CH_ICACHE_FLUSHALL(%o0, %o1, %o2, %o4) + retl + nop + SET_SIZE(flush_icache) + +#endif /* lint */ + +#if defined(lint) + +/*ARGSUSED*/ +void +kdi_flush_idcache(int dcache_size, int dcache_lsize, int icache_size, + int icache_lsize) +{ +} + +#else /* lint */ + + ENTRY(kdi_flush_idcache) + CH_DCACHE_FLUSHALL(%o0, %o1, %g1) + CH_ICACHE_FLUSHALL(%o2, %o3, %g1, %g2) + membar #Sync + retl + nop + SET_SIZE(kdi_flush_idcache) + +#endif /* lint */ + +#if defined(lint) + +void +flush_pcache(void) +{} + +#else /* lint */ + + ENTRY(flush_pcache) + PCACHE_FLUSHALL(%o0, %o1, %o2) + retl + nop + SET_SIZE(flush_pcache) + +#endif /* lint */ + + +#if defined(CPU_IMP_L1_CACHE_PARITY) + +#if defined(lint) + +/* ARGSUSED */ +void +get_dcache_dtag(uint32_t dcache_idx, uint64_t *data) +{} + +#else /* lint */ + +/* + * Get dcache data and tag. The Dcache data is a pointer to a ch_dc_data_t + * structure (see cheetahregs.h): + * The Dcache *should* be turned off when this code is executed. + */ + .align 128 + ENTRY(get_dcache_dtag) + rdpr %pstate, %o5 + andn %o5, PSTATE_IE | PSTATE_AM, %o3 + wrpr %g0, %o3, %pstate + b 1f + stx %o0, [%o1 + CH_DC_IDX] + + .align 128 +1: + ldxa [%o0]ASI_DC_TAG, %o2 + stx %o2, [%o1 + CH_DC_TAG] + membar #Sync + ldxa [%o0]ASI_DC_UTAG, %o2 + membar #Sync + stx %o2, [%o1 + CH_DC_UTAG] + ldxa [%o0]ASI_DC_SNP_TAG, %o2 + stx %o2, [%o1 + CH_DC_SNTAG] + add %o1, CH_DC_DATA, %o1 + clr %o3 +2: + membar #Sync ! required before ASI_DC_DATA + ldxa [%o0 + %o3]ASI_DC_DATA, %o2 + membar #Sync ! required after ASI_DC_DATA + stx %o2, [%o1 + %o3] + cmp %o3, CH_DC_DATA_REG_SIZE - 8 + blt 2b + add %o3, 8, %o3 + + /* + * Unlike other CPUs in the family, D$ data parity bits for Panther + * do not reside in the microtag. Instead, we have to read them + * using the DC_data_parity bit of ASI_DCACHE_DATA. Also, instead + * of just having 8 parity bits to protect all 32 bytes of data + * per line, we now have 32 bits of parity. + */ + GET_CPU_IMPL(%o3) + cmp %o3, PANTHER_IMPL + bne 4f + clr %o3 + + /* + * move our pointer to the next field where we store parity bits + * and add the offset of the last parity byte since we will be + * storing all 4 parity bytes within one 64 bit field like this: + * + * +------+------------+------------+------------+------------+ + * | - | DC_parity | DC_parity | DC_parity | DC_parity | + * | - | for word 3 | for word 2 | for word 1 | for word 0 | + * +------+------------+------------+------------+------------+ + * 63:32 31:24 23:16 15:8 7:0 + */ + add %o1, CH_DC_PN_DATA_PARITY - CH_DC_DATA + 7, %o1 + + /* add the DC_data_parity bit into our working index */ + mov 1, %o2 + sll %o2, PN_DC_DATA_PARITY_BIT_SHIFT, %o2 + or %o0, %o2, %o0 +3: + membar #Sync ! required before ASI_DC_DATA + ldxa [%o0 + %o3]ASI_DC_DATA, %o2 + membar #Sync ! 
required after ASI_DC_DATA + stb %o2, [%o1] + dec %o1 + cmp %o3, CH_DC_DATA_REG_SIZE - 8 + blt 3b + add %o3, 8, %o3 +4: + retl + wrpr %g0, %o5, %pstate + SET_SIZE(get_dcache_dtag) + +#endif /* lint */ + + +#if defined(lint) + +/* ARGSUSED */ +void +get_icache_dtag(uint32_t ecache_idx, uint64_t *data) +{} + +#else /* lint */ + +/* + * Get icache data and tag. The data argument is a pointer to a ch_ic_data_t + * structure (see cheetahregs.h): + * The Icache *Must* be turned off when this function is called. + * This is because diagnostic accesses to the Icache interfere with cache + * consistency. + */ + .align 128 + ENTRY(get_icache_dtag) + rdpr %pstate, %o5 + andn %o5, PSTATE_IE | PSTATE_AM, %o3 + wrpr %g0, %o3, %pstate + + stx %o0, [%o1 + CH_IC_IDX] + ldxa [%o0]ASI_IC_TAG, %o2 + stx %o2, [%o1 + CH_IC_PATAG] + add %o0, CH_ICTAG_UTAG, %o0 + ldxa [%o0]ASI_IC_TAG, %o2 + add %o0, (CH_ICTAG_UPPER - CH_ICTAG_UTAG), %o0 + stx %o2, [%o1 + CH_IC_UTAG] + ldxa [%o0]ASI_IC_TAG, %o2 + add %o0, (CH_ICTAG_LOWER - CH_ICTAG_UPPER), %o0 + stx %o2, [%o1 + CH_IC_UPPER] + ldxa [%o0]ASI_IC_TAG, %o2 + andn %o0, CH_ICTAG_TMASK, %o0 + stx %o2, [%o1 + CH_IC_LOWER] + ldxa [%o0]ASI_IC_SNP_TAG, %o2 + stx %o2, [%o1 + CH_IC_SNTAG] + add %o1, CH_IC_DATA, %o1 + clr %o3 +2: + ldxa [%o0 + %o3]ASI_IC_DATA, %o2 + stx %o2, [%o1 + %o3] + cmp %o3, PN_IC_DATA_REG_SIZE - 8 + blt 2b + add %o3, 8, %o3 + + retl + wrpr %g0, %o5, %pstate + SET_SIZE(get_icache_dtag) + +#endif /* lint */ + +#if defined(lint) + +/* ARGSUSED */ +void +get_pcache_dtag(uint32_t pcache_idx, uint64_t *data) +{} + +#else /* lint */ + +/* + * Get pcache data and tags. + * inputs: + * pcache_idx - fully constructed VA for for accessing P$ diagnostic + * registers. Contains PC_way and PC_addr shifted into + * the correct bit positions. See the PRM for more details. + * data - pointer to a ch_pc_data_t + * structure (see cheetahregs.h): + */ + .align 128 + ENTRY(get_pcache_dtag) + rdpr %pstate, %o5 + andn %o5, PSTATE_IE | PSTATE_AM, %o3 + wrpr %g0, %o3, %pstate + + stx %o0, [%o1 + CH_PC_IDX] + ldxa [%o0]ASI_PC_STATUS_DATA, %o2 + stx %o2, [%o1 + CH_PC_STATUS] + ldxa [%o0]ASI_PC_TAG, %o2 + stx %o2, [%o1 + CH_PC_TAG] + ldxa [%o0]ASI_PC_SNP_TAG, %o2 + stx %o2, [%o1 + CH_PC_SNTAG] + add %o1, CH_PC_DATA, %o1 + clr %o3 +2: + ldxa [%o0 + %o3]ASI_PC_DATA, %o2 + stx %o2, [%o1 + %o3] + cmp %o3, CH_PC_DATA_REG_SIZE - 8 + blt 2b + add %o3, 8, %o3 + + retl + wrpr %g0, %o5, %pstate + SET_SIZE(get_pcache_dtag) + +#endif /* lint */ + +#endif /* CPU_IMP_L1_CACHE_PARITY */ + +#if defined(lint) + +/* ARGSUSED */ +void +set_dcu(uint64_t dcu) +{} + +#else /* lint */ + +/* + * re-enable the i$, d$, w$, and p$ according to bootup cache state. + * Turn on WE, HPE, SPE, PE, IC, and DC bits defined as DCU_CACHE. + * %o0 - 64 bit constant + */ + ENTRY(set_dcu) + stxa %o0, [%g0]ASI_DCU ! Store to DCU + flush %g0 /* flush required after changing the IC bit */ + retl + nop + SET_SIZE(set_dcu) + +#endif /* lint */ + + +#if defined(lint) + +uint64_t +get_dcu(void) +{ + return ((uint64_t)0); +} + +#else /* lint */ + +/* + * Return DCU register. + */ + ENTRY(get_dcu) + ldxa [%g0]ASI_DCU, %o0 /* DCU control register */ + retl + nop + SET_SIZE(get_dcu) + +#endif /* lint */ + +/* + * Cheetah/Cheetah+ level 15 interrupt handler trap table entry. + * + * This handler is used to check for softints generated by error trap + * handlers to report errors. 
On Cheetah, this mechanism is used by the + * Fast ECC at TL>0 error trap handler and, on Cheetah+, by both the Fast + * ECC at TL>0 error and the I$/D$ parity error at TL>0 trap handlers. + * NB: Must be 8 instructions or less to fit in trap table and code must + * be relocatable. + */ +#if defined(lint) + +void +ch_pil15_interrupt_instr(void) +{} + +#else /* lint */ + + ENTRY_NP(ch_pil15_interrupt_instr) + ASM_JMP(%g1, ch_pil15_interrupt) + SET_SIZE(ch_pil15_interrupt_instr) + +#endif + + +#if defined(lint) + +void +ch_pil15_interrupt(void) +{} + +#else /* lint */ + + ENTRY_NP(ch_pil15_interrupt) + + /* + * Since pil_interrupt is hacked to assume that every level 15 + * interrupt is generated by the CPU to indicate a performance + * counter overflow this gets ugly. Before calling pil_interrupt + * the Error at TL>0 pending status is inspected. If it is + * non-zero, then an error has occurred and it is handled. + * Otherwise control is transfered to pil_interrupt. Note that if + * an error is detected pil_interrupt will not be called and + * overflow interrupts may be lost causing erroneous performance + * measurements. However, error-recovery will have a detrimental + * effect on performance anyway. + */ + CPU_INDEX(%g1, %g4) + set ch_err_tl1_pending, %g4 + ldub [%g1 + %g4], %g2 + brz %g2, 1f + nop + + /* + * We have a pending TL>0 error, clear the TL>0 pending status. + */ + stb %g0, [%g1 + %g4] + + /* + * Clear the softint. + */ + mov 1, %g5 + sll %g5, PIL_15, %g5 + wr %g5, CLEAR_SOFTINT + + /* + * For Cheetah*, call cpu_tl1_error via systrap at PIL 15 + * to process the Fast ECC/Cache Parity at TL>0 error. Clear + * panic flag (%g2). + */ + set cpu_tl1_error, %g1 + clr %g2 + ba sys_trap + mov PIL_15, %g4 + +1: + /* + * The logout is invalid. + * + * Call the default interrupt handler. + */ + sethi %hi(pil_interrupt), %g1 + jmp %g1 + %lo(pil_interrupt) + mov PIL_15, %g4 + + SET_SIZE(ch_pil15_interrupt) +#endif + + +/* + * Error Handling + * + * Cheetah provides error checking for all memory access paths between + * the CPU, External Cache, Cheetah Data Switch and system bus. Error + * information is logged in the AFSR, (also AFSR_EXT for Panther) and + * AFAR and one of the following traps is generated (provided that it + * is enabled in External Cache Error Enable Register) to handle that + * error: + * 1. trap 0x70: Precise trap + * tt0_fecc for errors at trap level(TL)>=0 + * 2. trap 0x0A and 0x32: Deferred trap + * async_err for errors at TL>=0 + * 3. trap 0x63: Disrupting trap + * ce_err for errors at TL=0 + * (Note that trap 0x63 cannot happen at trap level > 0) + * + * Trap level one handlers panic the system except for the fast ecc + * error handler which tries to recover from certain errors. + */ + +/* + * FAST ECC TRAP STRATEGY: + * + * Software must handle single and multi bit errors which occur due to data + * or instruction cache reads from the external cache. A single or multi bit + * error occuring in one of these situations results in a precise trap. + * + * The basic flow of this trap handler is as follows: + * + * 1) Record the state and then turn off the Dcache and Icache. The Dcache + * is disabled because bad data could have been installed. The Icache is + * turned off because we want to capture the Icache line related to the + * AFAR. + * 2) Disable trapping on CEEN/NCCEN errors during TL=0 processing. + * 3) Park sibling core if caches are shared (to avoid race condition while + * accessing shared resources such as L3 data staging register during + * CPU logout. 
+ * 4) Read the AFAR and AFSR. + * 5) If CPU logout structure is not being used, then: + * 6) Clear all errors from the AFSR. + * 7) Capture Ecache, Dcache and Icache lines in "CPU log out" structure. + * 8) Flush Ecache then Flush Dcache and Icache and restore to previous + * state. + * 9) Unpark sibling core if we parked it earlier. + * 10) call cpu_fast_ecc_error via systrap at PIL 14 unless we're already + * running at PIL 15. + * 6) Otherwise, if CPU logout structure is being used: + * 7) Incriment the "logout busy count". + * 8) Flush Ecache then Flush Dcache and Icache and restore to previous + * state. + * 9) Unpark sibling core if we parked it earlier. + * 10) Issue a retry since the other CPU error logging code will end up + * finding this error bit and logging information about it later. + * 7) Alternatively (to 5 and 6 above), if the cpu_private struct is not + * yet initialized such that we can't even check the logout struct, then + * we place the clo_flags data into %g2 (sys_trap->have_win arg #1) and + * call cpu_fast_ecc_error via systrap. The clo_flags parameter is used + * to determine information such as TL, TT, CEEN settings, etc in the + * high level trap handler since we don't have access to detailed logout + * information in cases where the cpu_private struct is not yet + * initialized. + * + * We flush the E$ and D$ here on TL=1 code to prevent getting nested + * Fast ECC traps in the TL=0 code. If we get a Fast ECC event here in + * the TL=1 code, we will go to the Fast ECC at TL>0 handler which, + * since it is uses different code/data from this handler, has a better + * chance of fixing things up than simply recursing through this code + * again (this would probably cause an eventual kernel stack overflow). + * If the Fast ECC at TL>0 handler encounters a Fast ECC error before it + * can flush the E$ (or the error is a stuck-at bit), we will recurse in + * the Fast ECC at TL>0 handler and eventually Red Mode. + * + * Note that for Cheetah (and only Cheetah), we use alias addresses for + * flushing rather than ASI accesses (which don't exist on Cheetah). + * Should we encounter a Fast ECC error within this handler on Cheetah, + * there's a good chance it's within the ecache_flushaddr buffer (since + * it's the largest piece of memory we touch in the handler and it is + * usually kernel text/data). For that reason the Fast ECC at TL>0 + * handler for Cheetah uses an alternate buffer: ecache_tl1_flushaddr. + */ + +/* + * Cheetah ecc-protected E$ trap (Trap 70) at TL=0 + * tt0_fecc is replaced by fecc_err_instr in cpu_init_trap of the various + * architecture-specific files. + * NB: Must be 8 instructions or less to fit in trap table and code must + * be relocatable. + */ + +#if defined(lint) + +void +fecc_err_instr(void) +{} + +#else /* lint */ + + ENTRY_NP(fecc_err_instr) + membar #Sync ! Cheetah requires membar #Sync + + /* + * Save current DCU state. Turn off the Dcache and Icache. + */ + ldxa [%g0]ASI_DCU, %g1 ! save DCU in %g1 + andn %g1, DCU_DC + DCU_IC, %g4 + stxa %g4, [%g0]ASI_DCU + flush %g0 /* flush required after changing the IC bit */ + + ASM_JMP(%g4, fast_ecc_err) + SET_SIZE(fecc_err_instr) + +#endif /* lint */ + + +#if !(defined(JALAPENO) || defined(SERRANO)) + +#if defined(lint) + +void +fast_ecc_err(void) +{} + +#else /* lint */ + + .section ".text" + .align 64 + ENTRY_NP(fast_ecc_err) + + /* + * Turn off CEEN and NCEEN. + */ + ldxa [%g0]ASI_ESTATE_ERR, %g3 + andn %g3, EN_REG_NCEEN + EN_REG_CEEN, %g4 + stxa %g4, [%g0]ASI_ESTATE_ERR + membar #Sync ! 
membar sync required + + /* + * Check to see whether we need to park our sibling core + * before recording diagnostic information from caches + * which may be shared by both cores. + * We use %g1 to store information about whether or not + * we had to park the core (%g1 holds our DCUCR value and + * we only use bits from that register which are "reserved" + * to keep track of core parking) so that we know whether + * or not to unpark later. %g5 and %g4 are scratch registers. + */ + PARK_SIBLING_CORE(%g1, %g5, %g4) + + /* + * Do the CPU log out capture. + * %g3 = "failed?" return value. + * %g2 = Input = AFAR. Output the clo_flags info which is passed + * into this macro via %g4. Output only valid if cpu_private + * struct has not been initialized. + * CHPR_FECCTL0_LOGOUT = cpu logout structure offset input + * %g4 = Trap information stored in the cpu logout flags field + * %g5 = scr1 + * %g6 = scr2 + * %g3 = scr3 + * %g4 = scr4 + */ + and %g3, EN_REG_CEEN, %g4 ! store the CEEN value, TL=0 + set CHPR_FECCTL0_LOGOUT, %g6 + DO_CPU_LOGOUT(%g3, %g2, %g6, %g4, %g5, %g6, %g3, %g4) + + /* + * Flush the Ecache (and L2 cache for Panther) to get the error out + * of the Ecache. If the UCC or UCU is on a dirty line, then the + * following flush will turn that into a WDC or WDU, respectively. + */ + PN_L2_FLUSHALL(%g4, %g5, %g6) + + CPU_INDEX(%g4, %g5) + mulx %g4, CPU_NODE_SIZE, %g4 + set cpunodes, %g5 + add %g4, %g5, %g4 + ld [%g4 + ECACHE_LINESIZE], %g5 + ld [%g4 + ECACHE_SIZE], %g4 + + ASM_LDX(%g6, ecache_flushaddr) + ECACHE_FLUSHALL(%g4, %g5, %g6, %g7) + + /* + * Flush the Dcache. Since bad data could have been installed in + * the Dcache we must flush it before re-enabling it. + */ + ASM_LD(%g5, dcache_size) + ASM_LD(%g6, dcache_linesize) + CH_DCACHE_FLUSHALL(%g5, %g6, %g7) + + /* + * Flush the Icache. Since we turned off the Icache to capture the + * Icache line it is now stale or corrupted and we must flush it + * before re-enabling it. + */ + GET_CPU_PRIVATE_PTR(%g0, %g5, %g7, fast_ecc_err_5); + ld [%g5 + CHPR_ICACHE_LINESIZE], %g6 + ba,pt %icc, 6f + ld [%g5 + CHPR_ICACHE_SIZE], %g5 +fast_ecc_err_5: + ASM_LD(%g5, icache_size) + ASM_LD(%g6, icache_linesize) +6: + CH_ICACHE_FLUSHALL(%g5, %g6, %g7, %g4) + + /* + * check to see whether we parked our sibling core at the start + * of this handler. If so, we need to unpark it here. + * We use DCUCR reserved bits (stored in %g1) to keep track of + * whether or not we need to unpark. %g5 and %g4 are scratch registers. + */ + UNPARK_SIBLING_CORE(%g1, %g5, %g4) + + /* + * Restore the Dcache and Icache to the previous state. + */ + stxa %g1, [%g0]ASI_DCU + flush %g0 /* flush required after changing the IC bit */ + + /* + * Make sure our CPU logout operation was successful. + */ + cmp %g3, %g0 + be 8f + nop + + /* + * If the logout structure had been busy, how many times have + * we tried to use it and failed (nesting count)? If we have + * already recursed a substantial number of times, then we can + * assume things are not going to get better by themselves and + * so it would be best to panic. + */ + cmp %g3, CLO_NESTING_MAX + blt 7f + nop + + call ptl1_panic + mov PTL1_BAD_ECC, %g1 + +7: + /* + * Otherwise, if the logout structure was busy but we have not + * nested more times than our maximum value, then we simply + * issue a retry. Our TL=0 trap handler code will check and + * clear the AFSR after it is done logging what is currently + * in the logout struct and handle this event at that time. 
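+ *
+ * The busy-path decision just described is, in C terms (a sketch
+ * only; retry() stands in for the retry instruction):
+ *
+ *	if (logout_busy_count >= CLO_NESTING_MAX)
+ *		ptl1_panic(PTL1_BAD_ECC);
+ *	retry();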
+ */ + retry +8: + /* + * Call cpu_fast_ecc_error via systrap at PIL 14 unless we're + * already at PIL 15. + */ + set cpu_fast_ecc_error, %g1 + rdpr %pil, %g4 + cmp %g4, PIL_14 + ba sys_trap + movl %icc, PIL_14, %g4 + + SET_SIZE(fast_ecc_err) + +#endif /* lint */ + +#endif /* !(JALAPENO || SERRANO) */ + + +/* + * Cheetah/Cheetah+ Fast ECC at TL>0 trap strategy: + * + * The basic flow of this trap handler is as follows: + * + * 1) In the "trap 70" trap table code (fecc_err_tl1_instr), generate a + * software trap 0 ("ta 0") to buy an extra set of %tpc, etc. which we + * will use to save %g1 and %g2. + * 2) At the software trap 0 at TL>0 trap table code (fecc_err_tl1_cont_instr), + * we save %g1+%g2 using %tpc, %tnpc + %tstate and jump to the fast ecc + * handler (using the just saved %g1). + * 3) Turn off the Dcache if it was on and save the state of the Dcache + * (whether on or off) in Bit2 (CH_ERR_TSTATE_DC_ON) of %tstate. + * NB: we don't turn off the Icache because bad data is not installed nor + * will we be doing any diagnostic accesses. + * 4) compute physical address of the per-cpu/per-tl save area using %g1+%g2 + * 5) Save %g1-%g7 into the per-cpu/per-tl save area (%g1 + %g2 from the + * %tpc, %tnpc, %tstate values previously saved). + * 6) set %tl to %tl - 1. + * 7) Save the appropriate flags and TPC in the ch_err_tl1_data structure. + * 8) Save the value of CH_ERR_TSTATE_DC_ON in the ch_err_tl1_tmp field. + * 9) For Cheetah and Jalapeno, read the AFAR and AFSR and clear. For + * Cheetah+ (and later), read the shadow AFAR and AFSR but don't clear. + * Save the values in ch_err_tl1_data. For Panther, read the shadow + * AFSR_EXT and save the value in ch_err_tl1_data. + * 10) Disable CEEN/NCEEN to prevent any disrupting/deferred errors from + * being queued. We'll report them via the AFSR/AFAR capture in step 13. + * 11) Flush the Ecache. + * NB: the Ecache is flushed assuming the largest possible size with + * the smallest possible line size since access to the cpu_nodes may + * cause an unrecoverable DTLB miss. + * 12) Reenable CEEN/NCEEN with the value saved from step 10. + * 13) For Cheetah and Jalapeno, read the AFAR and AFSR and clear again. + * For Cheetah+ (and later), read the primary AFAR and AFSR and now clear. + * Save the read AFSR/AFAR values in ch_err_tl1_data. For Panther, + * read and clear the primary AFSR_EXT and save it in ch_err_tl1_data. + * 14) Flush and re-enable the Dcache if it was on at step 3. + * 15) Do TRAPTRACE if enabled. + * 16) Check if a UCU->WDU (or L3_UCU->WDU for Panther) happened, panic if so. + * 17) Set the event pending flag in ch_err_tl1_pending[CPU] + * 18) Cause a softint 15. The pil15_interrupt handler will inspect the + * event pending flag and call cpu_tl1_error via systrap if set. + * 19) Restore the registers from step 5 and issue retry. + */ + +/* + * Cheetah ecc-protected E$ trap (Trap 70) at TL>0 + * tt1_fecc is replaced by fecc_err_tl1_instr in cpu_init_trap of the various + * architecture-specific files. This generates a "Software Trap 0" at TL>0, + * which goes to fecc_err_tl1_cont_instr, and we continue the handling there. + * NB: Must be 8 instructions or less to fit in trap table and code must + * be relocatable. + */ + +#if defined(lint) + +void +fecc_err_tl1_instr(void) +{} + +#else /* lint */ + + ENTRY_NP(fecc_err_tl1_instr) + CH_ERR_TL1_TRAPENTRY(SWTRAP_0); + SET_SIZE(fecc_err_tl1_instr) + +#endif /* lint */ + +/* + * Software trap 0 at TL>0. 
+ * tt1_swtrap0 is replaced by fecc_err_tl1_cont_instr in cpu_init_trap of + * the various architecture-specific files. This is used as a continuation + * of the fast ecc handling where we've bought an extra TL level, so we can + * use %tpc, %tnpc, %tstate to temporarily save the value of registers %g1 + * and %g2. Note that %tstate has bits 0-2 and then bits 8-19 as r/w, + * there's a reserved hole from 3-7. We only use bits 0-1 and 8-9 (the low + * order two bits from %g1 and %g2 respectively). + * NB: Must be 8 instructions or less to fit in trap table and code must + * be relocatable. + */ +#if defined(lint) + +void +fecc_err_tl1_cont_instr(void) +{} + +#else /* lint */ + + ENTRY_NP(fecc_err_tl1_cont_instr) + CH_ERR_TL1_SWTRAPENTRY(fast_ecc_tl1_err) + SET_SIZE(fecc_err_tl1_cont_instr) + +#endif /* lint */ + + +#if defined(lint) + +void +ce_err(void) +{} + +#else /* lint */ + +/* + * The ce_err function handles disrupting trap type 0x63 at TL=0. + * + * AFSR errors bits which cause this trap are: + * CE, EMC, EDU:ST, EDC, WDU, WDC, CPU, CPC, IVU, IVC + * + * NCEEN Bit of Cheetah External Cache Error Enable Register enables + * the following AFSR disrupting traps: EDU:ST, WDU, CPU, IVU + * + * CEEN Bit of Cheetah External Cache Error Enable Register enables + * the following AFSR disrupting traps: CE, EMC, EDC, WDC, CPC, IVC + * + * Cheetah+ also handles (No additional processing required): + * DUE, DTO, DBERR (NCEEN controlled) + * THCE (CEEN and ET_ECC_en controlled) + * TUE (ET_ECC_en controlled) + * + * Panther further adds: + * IMU, L3_EDU, L3_WDU, L3_CPU (NCEEN controlled) + * IMC, L3_EDC, L3_WDC, L3_CPC, L3_THCE (CEEN controlled) + * TUE_SH, TUE (NCEEN and L2_tag_ECC_en controlled) + * L3_TUE, L3_TUE_SH (NCEEN and ET_ECC_en controlled) + * THCE (CEEN and L2_tag_ECC_en controlled) + * L3_THCE (CEEN and ET_ECC_en controlled) + * + * Steps: + * 1. Disable hardware corrected disrupting errors only (CEEN) + * 2. Park sibling core if caches are shared (to avoid race + * condition while accessing shared resources such as L3 + * data staging register during CPU logout. + * 3. If the CPU logout structure is not currently being used: + * 4. Clear AFSR error bits + * 5. Capture Ecache, Dcache and Icache lines associated + * with AFAR. + * 6. Unpark sibling core if we parked it earlier. + * 7. call cpu_disrupting_error via sys_trap at PIL 14 + * unless we're already running at PIL 15. + * 4. Otherwise, if the CPU logout structure is busy: + * 5. Incriment "logout busy count" and place into %g3 + * 6. Unpark sibling core if we parked it earlier. + * 7. Issue a retry since the other CPU error logging + * code will end up finding this error bit and logging + * information about it later. + * 5. Alternatively (to 3 and 4 above), if the cpu_private struct is + * not yet initialized such that we can't even check the logout + * struct, then we place the clo_flags data into %g2 + * (sys_trap->have_win arg #1) and call cpu_disrupting_error via + * systrap. The clo_flags parameter is used to determine information + * such as TL, TT, CEEN settings, etc in the high level trap + * handler since we don't have access to detailed logout information + * in cases where the cpu_private struct is not yet initialized. + * + * %g3: [ logout busy count ] - arg #2 + * %g2: [ clo_flags if cpu_private unavailable ] - sys_trap->have_win: arg #1 + */ + + .align 128 + ENTRY_NP(ce_err) + membar #Sync ! 
Cheetah requires membar #Sync + + /* + * Disable trap on hardware corrected errors (CEEN) while at TL=0 + * to prevent recursion. + */ + ldxa [%g0]ASI_ESTATE_ERR, %g1 + bclr EN_REG_CEEN, %g1 + stxa %g1, [%g0]ASI_ESTATE_ERR + membar #Sync ! membar sync required + + /* + * Save current DCU state. Turn off Icache to allow capture of + * Icache data by DO_CPU_LOGOUT. + */ + ldxa [%g0]ASI_DCU, %g1 ! save DCU in %g1 + andn %g1, DCU_IC, %g4 + stxa %g4, [%g0]ASI_DCU + flush %g0 /* flush required after changing the IC bit */ + + /* + * Check to see whether we need to park our sibling core + * before recording diagnostic information from caches + * which may be shared by both cores. + * We use %g1 to store information about whether or not + * we had to park the core (%g1 holds our DCUCR value and + * we only use bits from that register which are "reserved" + * to keep track of core parking) so that we know whether + * or not to unpark later. %g5 and %g4 are scratch registers. + */ + PARK_SIBLING_CORE(%g1, %g5, %g4) + + /* + * Do the CPU log out capture. + * %g3 = "failed?" return value. + * %g2 = Input = AFAR. Output the clo_flags info which is passed + * into this macro via %g4. Output only valid if cpu_private + * struct has not been initialized. + * CHPR_CECC_LOGOUT = cpu logout structure offset input + * %g4 = Trap information stored in the cpu logout flags field + * %g5 = scr1 + * %g6 = scr2 + * %g3 = scr3 + * %g4 = scr4 + */ + clr %g4 ! TL=0 bit in afsr + set CHPR_CECC_LOGOUT, %g6 + DO_CPU_LOGOUT(%g3, %g2, %g6, %g4, %g5, %g6, %g3, %g4) + + /* + * Flush the Icache. Since we turned off the Icache to capture the + * Icache line it is now stale or corrupted and we must flush it + * before re-enabling it. + */ + GET_CPU_PRIVATE_PTR(%g0, %g5, %g7, ce_err_1); + ld [%g5 + CHPR_ICACHE_LINESIZE], %g6 + ba,pt %icc, 2f + ld [%g5 + CHPR_ICACHE_SIZE], %g5 +ce_err_1: + ASM_LD(%g5, icache_size) + ASM_LD(%g6, icache_linesize) +2: + CH_ICACHE_FLUSHALL(%g5, %g6, %g7, %g4) + + /* + * check to see whether we parked our sibling core at the start + * of this handler. If so, we need to unpark it here. + * We use DCUCR reserved bits (stored in %g1) to keep track of + * whether or not we need to unpark. %g5 and %g4 are scratch registers. + */ + UNPARK_SIBLING_CORE(%g1, %g5, %g4) + + /* + * Restore Icache to previous state. + */ + stxa %g1, [%g0]ASI_DCU + flush %g0 /* flush required after changing the IC bit */ + + /* + * Make sure our CPU logout operation was successful. + */ + cmp %g3, %g0 + be 4f + nop + + /* + * If the logout structure had been busy, how many times have + * we tried to use it and failed (nesting count)? If we have + * already recursed a substantial number of times, then we can + * assume things are not going to get better by themselves and + * so it would be best to panic. + */ + cmp %g3, CLO_NESTING_MAX + blt 3f + nop + + call ptl1_panic + mov PTL1_BAD_ECC, %g1 + +3: + /* + * Otherwise, if the logout structure was busy but we have not + * nested more times than our maximum value, then we simply + * issue a retry. Our TL=0 trap handler code will check and + * clear the AFSR after it is done logging what is currently + * in the logout struct and handle this event at that time. + */ + retry +4: + /* + * Call cpu_disrupting_error via systrap at PIL 14 unless we're + * already at PIL 15. 
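+ *
+ * The conditional move that follows implements "take the trap at PIL
+ * 14 unless we are already running at or above that level"; as a C
+ * sketch (getpil() and the sys_trap argument list are stand-ins, not
+ * real interfaces):
+ *
+ *	pil = getpil();
+ *	sys_trap(cpu_disrupting_error, clo_flags, logout_busy_count,
+ *	    (pil < PIL_14) ? PIL_14 : pil);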
+ */ + set cpu_disrupting_error, %g1 + rdpr %pil, %g4 + cmp %g4, PIL_14 + ba sys_trap + movl %icc, PIL_14, %g4 + SET_SIZE(ce_err) + +#endif /* lint */ + + +#if defined(lint) + +/* + * This trap cannot happen at TL>0 which means this routine will never + * actually be called and so we treat this like a BAD TRAP panic. + */ +void +ce_err_tl1(void) +{} + +#else /* lint */ + + .align 64 + ENTRY_NP(ce_err_tl1) + + call ptl1_panic + mov PTL1_BAD_TRAP, %g1 + + SET_SIZE(ce_err_tl1) + +#endif /* lint */ + + +#if defined(lint) + +void +async_err(void) +{} + +#else /* lint */ + +/* + * The async_err function handles deferred trap types 0xA + * (instruction_access_error) and 0x32 (data_access_error) at TL>=0. + * + * AFSR errors bits which cause this trap are: + * UE, EMU, EDU:BLD, L3_EDU:BLD, TO, BERR + * On some platforms, EMU may causes cheetah to pull the error pin + * never giving Solaris a chance to take a trap. + * + * NCEEN Bit of Cheetah External Cache Error Enable Register enables + * the following AFSR deferred traps: UE, EMU, EDU:BLD, TO, BERR + * + * Steps: + * 1. Disable CEEN and NCEEN errors to prevent recursive errors. + * 2. Turn D$ off per Cheetah PRM P.5 Note 6, turn I$ off to capture + * I$ line in DO_CPU_LOGOUT. + * 3. Park sibling core if caches are shared (to avoid race + * condition while accessing shared resources such as L3 + * data staging register during CPU logout. + * 4. If the CPU logout structure is not currently being used: + * 5. Clear AFSR error bits + * 6. Capture Ecache, Dcache and Icache lines associated + * with AFAR. + * 7. Unpark sibling core if we parked it earlier. + * 8. call cpu_deferred_error via sys_trap. + * 5. Otherwise, if the CPU logout structure is busy: + * 6. Incriment "logout busy count" + * 7. Unpark sibling core if we parked it earlier. + * 8) Issue a retry since the other CPU error logging + * code will end up finding this error bit and logging + * information about it later. + * 6. Alternatively (to 4 and 5 above), if the cpu_private struct is + * not yet initialized such that we can't even check the logout + * struct, then we place the clo_flags data into %g2 + * (sys_trap->have_win arg #1) and call cpu_deferred_error via + * systrap. The clo_flags parameter is used to determine information + * such as TL, TT, CEEN settings, etc in the high level trap handler + * since we don't have access to detailed logout information in cases + * where the cpu_private struct is not yet initialized. + * + * %g2: [ clo_flags if cpu_private unavailable ] - sys_trap->have_win: arg #1 + * %g3: [ logout busy count ] - arg #2 + */ + + ENTRY_NP(async_err) + membar #Sync ! Cheetah requires membar #Sync + + /* + * Disable CEEN and NCEEN. + */ + ldxa [%g0]ASI_ESTATE_ERR, %g3 + andn %g3, EN_REG_NCEEN + EN_REG_CEEN, %g4 + stxa %g4, [%g0]ASI_ESTATE_ERR + membar #Sync ! membar sync required + + /* + * Save current DCU state. + * Disable Icache to allow capture of Icache data by DO_CPU_LOGOUT. + * Do this regardless of whether this is a Data Access Error or + * Instruction Access Error Trap. + * Disable Dcache for both Data Access Error and Instruction Access + * Error per Cheetah PRM P.5 Note 6. + */ + ldxa [%g0]ASI_DCU, %g1 ! save DCU in %g1 + andn %g1, DCU_IC + DCU_DC, %g4 + stxa %g4, [%g0]ASI_DCU + flush %g0 /* flush required after changing the IC bit */ + + /* + * Check to see whether we need to park our sibling core + * before recording diagnostic information from caches + * which may be shared by both cores. 
+ * We use %g1 to store information about whether or not + * we had to park the core (%g1 holds our DCUCR value and + * we only use bits from that register which are "reserved" + * to keep track of core parking) so that we know whether + * or not to unpark later. %g6 and %g4 are scratch registers. + */ + PARK_SIBLING_CORE(%g1, %g6, %g4) + + /* + * Do the CPU logout capture. + * + * %g3 = "failed?" return value. + * %g2 = Input = AFAR. Output the clo_flags info which is passed + * into this macro via %g4. Output only valid if cpu_private + * struct has not been initialized. + * CHPR_ASYNC_LOGOUT = cpu logout structure offset input + * %g4 = Trap information stored in the cpu logout flags field + * %g5 = scr1 + * %g6 = scr2 + * %g3 = scr3 + * %g4 = scr4 + */ + andcc %g5, T_TL1, %g0 + clr %g6 + movnz %xcc, 1, %g6 ! set %g6 if T_TL1 set + sllx %g6, CLO_FLAGS_TL_SHIFT, %g6 + sllx %g5, CLO_FLAGS_TT_SHIFT, %g4 + set CLO_FLAGS_TT_MASK, %g2 + and %g4, %g2, %g4 ! ttype + or %g6, %g4, %g4 ! TT and TL + and %g3, EN_REG_CEEN, %g3 ! CEEN value + or %g3, %g4, %g4 ! TT and TL and CEEN + set CHPR_ASYNC_LOGOUT, %g6 + DO_CPU_LOGOUT(%g3, %g2, %g6, %g4, %g5, %g6, %g3, %g4) + + /* + * If the logout struct was busy, we may need to pass the + * TT, TL, and CEEN information to the TL=0 handler via + * systrap parameter so save it off here. + */ + cmp %g3, %g0 + be 1f + nop + sllx %g4, 32, %g4 + or %g4, %g3, %g3 +1: + /* + * Flush the Icache. Since we turned off the Icache to capture the + * Icache line it is now stale or corrupted and we must flush it + * before re-enabling it. + */ + GET_CPU_PRIVATE_PTR(%g0, %g5, %g7, async_err_1); + ld [%g5 + CHPR_ICACHE_LINESIZE], %g6 + ba,pt %icc, 2f + ld [%g5 + CHPR_ICACHE_SIZE], %g5 +async_err_1: + ASM_LD(%g5, icache_size) + ASM_LD(%g6, icache_linesize) +2: + CH_ICACHE_FLUSHALL(%g5, %g6, %g7, %g4) + + /* + * XXX - Don't we need to flush the Dcache before turning it back + * on to avoid stale or corrupt data? Was this broken? + */ + /* + * Flush the Dcache before turning it back on since it may now + * contain stale or corrupt data. + */ + ASM_LD(%g5, dcache_size) + ASM_LD(%g6, dcache_linesize) + CH_DCACHE_FLUSHALL(%g5, %g6, %g7) + + /* + * check to see whether we parked our sibling core at the start + * of this handler. If so, we need to unpark it here. + * We use DCUCR reserved bits (stored in %g1) to keep track of + * whether or not we need to unpark. %g5 and %g7 are scratch registers. + */ + UNPARK_SIBLING_CORE(%g1, %g5, %g7) + + /* + * Restore Icache and Dcache to previous state. + */ + stxa %g1, [%g0]ASI_DCU + flush %g0 /* flush required after changing the IC bit */ + + /* + * Make sure our CPU logout operation was successful. + */ + cmp %g3, %g0 + be 4f + nop + + /* + * If the logout structure had been busy, how many times have + * we tried to use it and failed (nesting count)? If we have + * already recursed a substantial number of times, then we can + * assume things are not going to get better by themselves and + * so it would be best to panic. + */ + cmp %g3, CLO_NESTING_MAX + blt 3f + nop + + call ptl1_panic + mov PTL1_BAD_ECC, %g1 + +3: + /* + * Otherwise, if the logout structure was busy but we have not + * nested more times than our maximum value, then we simply + * issue a retry. Our TL=0 trap handler code will check and + * clear the AFSR after it is done logging what is currently + * in the logout struct and handle this event at that time. + */ + retry +4: + set cpu_deferred_error, %g1 + ba sys_trap + mov PIL_15, %g4 ! 
run at pil 15 + SET_SIZE(async_err) + +#endif /* lint */ + +#if defined(CPU_IMP_L1_CACHE_PARITY) + +/* + * D$ parity error trap (trap 71) at TL=0. + * tt0_dperr is replaced by dcache_parity_instr in cpu_init_trap of + * the various architecture-specific files. This merely sets up the + * arguments for cpu_parity_error and calls it via sys_trap. + * NB: Must be 8 instructions or less to fit in trap table and code must + * be relocatable. + */ +#if defined(lint) + +void +dcache_parity_instr(void) +{} + +#else /* lint */ + ENTRY_NP(dcache_parity_instr) + membar #Sync ! Cheetah+ requires membar #Sync + set cpu_parity_error, %g1 + or %g0, CH_ERR_DPE, %g2 + rdpr %tpc, %g3 + sethi %hi(sys_trap), %g7 + jmp %g7 + %lo(sys_trap) + mov PIL_15, %g4 ! run at pil 15 + SET_SIZE(dcache_parity_instr) + +#endif /* lint */ + + +/* + * D$ parity error trap (trap 71) at TL>0. + * tt1_dperr is replaced by dcache_parity_tl1_instr in cpu_init_trap of + * the various architecture-specific files. This generates a "Software + * Trap 1" at TL>0, which goes to dcache_parity_tl1_cont_instr, and we + * continue the handling there. + * NB: Must be 8 instructions or less to fit in trap table and code must + * be relocatable. + */ +#if defined(lint) + +void +dcache_parity_tl1_instr(void) +{} + +#else /* lint */ + ENTRY_NP(dcache_parity_tl1_instr) + CH_ERR_TL1_TRAPENTRY(SWTRAP_1); + SET_SIZE(dcache_parity_tl1_instr) + +#endif /* lint */ + + +/* + * Software trap 1 at TL>0. + * tt1_swtrap1 is replaced by dcache_parity_tl1_cont_instr in cpu_init_trap + * of the various architecture-specific files. This is used as a continuation + * of the dcache parity handling where we've bought an extra TL level, so we + * can use %tpc, %tnpc, %tstate to temporarily save the value of registers %g1 + * and %g2. Note that %tstate has bits 0-2 and then bits 8-19 as r/w, + * there's a reserved hole from 3-7. We only use bits 0-1 and 8-9 (the low + * order two bits from %g1 and %g2 respectively). + * NB: Must be 8 instructions or less to fit in trap table and code must + * be relocatable. + */ +#if defined(lint) + +void +dcache_parity_tl1_cont_instr(void) +{} + +#else /* lint */ + ENTRY_NP(dcache_parity_tl1_cont_instr) + CH_ERR_TL1_SWTRAPENTRY(dcache_parity_tl1_err); + SET_SIZE(dcache_parity_tl1_cont_instr) + +#endif /* lint */ + +/* + * D$ parity error at TL>0 handler + * We get here via trap 71 at TL>0->Software trap 1 at TL>0. We enter + * this routine with %g1 and %g2 already saved in %tpc, %tnpc and %tstate. + */ +#if defined(lint) + +void +dcache_parity_tl1_err(void) +{} + +#else /* lint */ + + ENTRY_NP(dcache_parity_tl1_err) + + /* + * This macro saves all the %g registers in the ch_err_tl1_data + * structure, updates the ch_err_tl1_flags and saves the %tpc in + * ch_err_tl1_tpc. At the end of this macro, %g1 will point to + * the ch_err_tl1_data structure and %g2 will have the original + * flags in the ch_err_tl1_data structure. All %g registers + * except for %g1 and %g2 will be available. + */ + CH_ERR_TL1_ENTER(CH_ERR_DPE); + +#ifdef TRAPTRACE + /* + * Get current trap trace entry physical pointer. + */ + CPU_INDEX(%g6, %g5) + sll %g6, TRAPTR_SIZE_SHIFT, %g6 + set trap_trace_ctl, %g5 + add %g6, %g5, %g6 + ld [%g6 + TRAPTR_LIMIT], %g5 + tst %g5 + be %icc, dpe_tl1_skip_tt + nop + ldx [%g6 + TRAPTR_PBASE], %g5 + ld [%g6 + TRAPTR_OFFSET], %g4 + add %g5, %g4, %g5 + + /* + * Create trap trace entry. 
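+ *
+ * What the TRAPTRACE code below does, modeled in C, where pbase,
+ * offset, limit and last_offset correspond to the TRAPTR_* fields
+ * loaded above and fill_entry() is a hypothetical helper for the
+ * stores issued through TRAPTR_ASI:
+ *
+ *	ent = pbase + offset;
+ *	fill_entry(ent);
+ *	last_offset = offset;
+ *	next = offset + TRAP_ENT_SIZE;
+ *	if (next >= limit - TRAP_ENT_SIZE)
+ *		next = 0;
+ *	offset = next;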
+ */ + rd %asi, %g7 + wr %g0, TRAPTR_ASI, %asi + rd STICK, %g4 + stxa %g4, [%g5 + TRAP_ENT_TICK]%asi + rdpr %tl, %g4 + stha %g4, [%g5 + TRAP_ENT_TL]%asi + rdpr %tt, %g4 + stha %g4, [%g5 + TRAP_ENT_TT]%asi + rdpr %tpc, %g4 + stna %g4, [%g5 + TRAP_ENT_TPC]%asi + rdpr %tstate, %g4 + stxa %g4, [%g5 + TRAP_ENT_TSTATE]%asi + stna %sp, [%g5 + TRAP_ENT_SP]%asi + stna %g0, [%g5 + TRAP_ENT_TR]%asi + stna %g0, [%g5 + TRAP_ENT_F1]%asi + stna %g0, [%g5 + TRAP_ENT_F2]%asi + stna %g0, [%g5 + TRAP_ENT_F3]%asi + stna %g0, [%g5 + TRAP_ENT_F4]%asi + wr %g0, %g7, %asi + + /* + * Advance trap trace pointer. + */ + ld [%g6 + TRAPTR_OFFSET], %g5 + ld [%g6 + TRAPTR_LIMIT], %g4 + st %g5, [%g6 + TRAPTR_LAST_OFFSET] + add %g5, TRAP_ENT_SIZE, %g5 + sub %g4, TRAP_ENT_SIZE, %g4 + cmp %g5, %g4 + movge %icc, 0, %g5 + st %g5, [%g6 + TRAPTR_OFFSET] +dpe_tl1_skip_tt: +#endif /* TRAPTRACE */ + + /* + * I$ and D$ are automatically turned off by HW when the CPU hits + * a dcache or icache parity error so we will just leave those two + * off for now to avoid repeating this trap. + * For Panther, however, since we trap on P$ data parity errors + * and HW does not automatically disable P$, we need to disable it + * here so that we don't encounter any recursive traps when we + * issue the retry. + */ + ldxa [%g0]ASI_DCU, %g3 + mov 1, %g4 + sllx %g4, DCU_PE_SHIFT, %g4 + andn %g3, %g4, %g3 + stxa %g3, [%g0]ASI_DCU + membar #Sync + + /* + * We fall into this macro if we've successfully logged the error in + * the ch_err_tl1_data structure and want the PIL15 softint to pick + * it up and log it. %g1 must point to the ch_err_tl1_data structure. + * Restores the %g registers and issues retry. + */ + CH_ERR_TL1_EXIT; + SET_SIZE(dcache_parity_tl1_err) + +#endif /* lint */ + +/* + * I$ parity error trap (trap 72) at TL=0. + * tt0_iperr is replaced by icache_parity_instr in cpu_init_trap of + * the various architecture-specific files. This merely sets up the + * arguments for cpu_parity_error and calls it via sys_trap. + * NB: Must be 8 instructions or less to fit in trap table and code must + * be relocatable. + */ +#if defined(lint) + +void +icache_parity_instr(void) +{} + +#else /* lint */ + + ENTRY_NP(icache_parity_instr) + membar #Sync ! Cheetah+ requires membar #Sync + set cpu_parity_error, %g1 + or %g0, CH_ERR_IPE, %g2 + rdpr %tpc, %g3 + sethi %hi(sys_trap), %g7 + jmp %g7 + %lo(sys_trap) + mov PIL_15, %g4 ! run at pil 15 + SET_SIZE(icache_parity_instr) + +#endif /* lint */ + +/* + * I$ parity error trap (trap 72) at TL>0. + * tt1_iperr is replaced by icache_parity_tl1_instr in cpu_init_trap of + * the various architecture-specific files. This generates a "Software + * Trap 2" at TL>0, which goes to icache_parity_tl1_cont_instr, and we + * continue the handling there. + * NB: Must be 8 instructions or less to fit in trap table and code must + * be relocatable. + */ +#if defined(lint) + +void +icache_parity_tl1_instr(void) +{} + +#else /* lint */ + ENTRY_NP(icache_parity_tl1_instr) + CH_ERR_TL1_TRAPENTRY(SWTRAP_2); + SET_SIZE(icache_parity_tl1_instr) + +#endif /* lint */ + +/* + * Software trap 2 at TL>0. + * tt1_swtrap2 is replaced by icache_parity_tl1_cont_instr in cpu_init_trap + * of the various architecture-specific files. This is used as a continuation + * of the icache parity handling where we've bought an extra TL level, so we + * can use %tpc, %tnpc, %tstate to temporarily save the value of registers %g1 + * and %g2. Note that %tstate has bits 0-2 and then bits 8-19 as r/w, + * there's a reserved hole from 3-7. 
We only use bits 0-1 and 8-9 (the low + * order two bits from %g1 and %g2 respectively). + * NB: Must be 8 instructions or less to fit in trap table and code must + * be relocatable. + */ +#if defined(lint) + +void +icache_parity_tl1_cont_instr(void) +{} + +#else /* lint */ + ENTRY_NP(icache_parity_tl1_cont_instr) + CH_ERR_TL1_SWTRAPENTRY(icache_parity_tl1_err); + SET_SIZE(icache_parity_tl1_cont_instr) + +#endif /* lint */ + + +/* + * I$ parity error at TL>0 handler + * We get here via trap 72 at TL>0->Software trap 2 at TL>0. We enter + * this routine with %g1 and %g2 already saved in %tpc, %tnpc and %tstate. + */ +#if defined(lint) + +void +icache_parity_tl1_err(void) +{} + +#else /* lint */ + + ENTRY_NP(icache_parity_tl1_err) + + /* + * This macro saves all the %g registers in the ch_err_tl1_data + * structure, updates the ch_err_tl1_flags and saves the %tpc in + * ch_err_tl1_tpc. At the end of this macro, %g1 will point to + * the ch_err_tl1_data structure and %g2 will have the original + * flags in the ch_err_tl1_data structure. All %g registers + * except for %g1 and %g2 will be available. + */ + CH_ERR_TL1_ENTER(CH_ERR_IPE); + +#ifdef TRAPTRACE + /* + * Get current trap trace entry physical pointer. + */ + CPU_INDEX(%g6, %g5) + sll %g6, TRAPTR_SIZE_SHIFT, %g6 + set trap_trace_ctl, %g5 + add %g6, %g5, %g6 + ld [%g6 + TRAPTR_LIMIT], %g5 + tst %g5 + be %icc, ipe_tl1_skip_tt + nop + ldx [%g6 + TRAPTR_PBASE], %g5 + ld [%g6 + TRAPTR_OFFSET], %g4 + add %g5, %g4, %g5 + + /* + * Create trap trace entry. + */ + rd %asi, %g7 + wr %g0, TRAPTR_ASI, %asi + rd STICK, %g4 + stxa %g4, [%g5 + TRAP_ENT_TICK]%asi + rdpr %tl, %g4 + stha %g4, [%g5 + TRAP_ENT_TL]%asi + rdpr %tt, %g4 + stha %g4, [%g5 + TRAP_ENT_TT]%asi + rdpr %tpc, %g4 + stna %g4, [%g5 + TRAP_ENT_TPC]%asi + rdpr %tstate, %g4 + stxa %g4, [%g5 + TRAP_ENT_TSTATE]%asi + stna %sp, [%g5 + TRAP_ENT_SP]%asi + stna %g0, [%g5 + TRAP_ENT_TR]%asi + stna %g0, [%g5 + TRAP_ENT_F1]%asi + stna %g0, [%g5 + TRAP_ENT_F2]%asi + stna %g0, [%g5 + TRAP_ENT_F3]%asi + stna %g0, [%g5 + TRAP_ENT_F4]%asi + wr %g0, %g7, %asi + + /* + * Advance trap trace pointer. + */ + ld [%g6 + TRAPTR_OFFSET], %g5 + ld [%g6 + TRAPTR_LIMIT], %g4 + st %g5, [%g6 + TRAPTR_LAST_OFFSET] + add %g5, TRAP_ENT_SIZE, %g5 + sub %g4, TRAP_ENT_SIZE, %g4 + cmp %g5, %g4 + movge %icc, 0, %g5 + st %g5, [%g6 + TRAPTR_OFFSET] +ipe_tl1_skip_tt: +#endif /* TRAPTRACE */ + + /* + * We fall into this macro if we've successfully logged the error in + * the ch_err_tl1_data structure and want the PIL15 softint to pick + * it up and log it. %g1 must point to the ch_err_tl1_data structure. + * Restores the %g registers and issues retry. + */ + CH_ERR_TL1_EXIT; + + SET_SIZE(icache_parity_tl1_err) + +#endif /* lint */ + +#endif /* CPU_IMP_L1_CACHE_PARITY */ + + +/* + * The itlb_rd_entry and dtlb_rd_entry functions return the tag portion of the + * tte, the virtual address, and the ctxnum of the specified tlb entry. They + * should only be used in places where you have no choice but to look at the + * tlb itself. + * + * Note: These two routines are required by the Estar "cpr" loadable module. 
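+ *
+ * The cpr use is essentially a walk over every slot so that locked
+ * translations can be saved and later restored; a hedged sketch,
+ * where itlb_entries, save_mapping() and the TTE_IS_* checks shown
+ * are illustrative only:
+ *
+ *	tte_t tte;
+ *	uint64_t va_tag;
+ *	uint_t i;
+ *
+ *	for (i = 0; i < itlb_entries; i++) {
+ *		itlb_rd_entry(i, &tte, &va_tag);
+ *		if (TTE_IS_VALID(&tte) && TTE_IS_LOCKED(&tte))
+ *			save_mapping(i, &tte, va_tag);
+ *	}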
+ */ + +#if defined(lint) + +/* ARGSUSED */ +void +itlb_rd_entry(uint_t entry, tte_t *tte, uint64_t *va_tag) +{} + +#else /* lint */ + + ENTRY_NP(itlb_rd_entry) + sllx %o0, 3, %o0 + ldxa [%o0]ASI_ITLB_ACCESS, %g1 + stx %g1, [%o1] + ldxa [%o0]ASI_ITLB_TAGREAD, %g2 + set TAGREAD_CTX_MASK, %o4 + andn %g2, %o4, %o5 + retl + stx %o5, [%o2] + SET_SIZE(itlb_rd_entry) + +#endif /* lint */ + + +#if defined(lint) + +/* ARGSUSED */ +void +dtlb_rd_entry(uint_t entry, tte_t *tte, uint64_t *va_tag) +{} + +#else /* lint */ + + ENTRY_NP(dtlb_rd_entry) + sllx %o0, 3, %o0 + ldxa [%o0]ASI_DTLB_ACCESS, %g1 + stx %g1, [%o1] + ldxa [%o0]ASI_DTLB_TAGREAD, %g2 + set TAGREAD_CTX_MASK, %o4 + andn %g2, %o4, %o5 + retl + stx %o5, [%o2] + SET_SIZE(dtlb_rd_entry) +#endif /* lint */ + + +#if !(defined(JALAPENO) || defined(SERRANO)) + +#if defined(lint) + +uint64_t +get_safari_config(void) +{ return (0); } + +#else /* lint */ + + ENTRY(get_safari_config) + ldxa [%g0]ASI_SAFARI_CONFIG, %o0 + retl + nop + SET_SIZE(get_safari_config) + +#endif /* lint */ + + +#if defined(lint) + +/* ARGSUSED */ +void +set_safari_config(uint64_t safari_config) +{} + +#else /* lint */ + + ENTRY(set_safari_config) + stxa %o0, [%g0]ASI_SAFARI_CONFIG + membar #Sync + retl + nop + SET_SIZE(set_safari_config) + +#endif /* lint */ + +#endif /* !(JALAPENO || SERRANO) */ + + +#if defined(lint) + +void +cpu_cleartickpnt(void) +{} + +#else /* lint */ + /* + * Clear the NPT (non-privileged trap) bit in the %tick/%stick + * registers. In an effort to make the change in the + * tick/stick counter as consistent as possible, we disable + * all interrupts while we're changing the registers. We also + * ensure that the read and write instructions are in the same + * line in the instruction cache. + */ + ENTRY_NP(cpu_clearticknpt) + rdpr %pstate, %g1 /* save processor state */ + andn %g1, PSTATE_IE, %g3 /* turn off */ + wrpr %g0, %g3, %pstate /* interrupts */ + rdpr %tick, %g2 /* get tick register */ + brgez,pn %g2, 1f /* if NPT bit off, we're done */ + mov 1, %g3 /* create mask */ + sllx %g3, 63, %g3 /* for NPT bit */ + ba,a,pt %xcc, 2f + .align 8 /* Ensure rd/wr in same i$ line */ +2: + rdpr %tick, %g2 /* get tick register */ + wrpr %g3, %g2, %tick /* write tick register, */ + /* clearing NPT bit */ +1: + rd STICK, %g2 /* get stick register */ + brgez,pn %g2, 3f /* if NPT bit off, we're done */ + mov 1, %g3 /* create mask */ + sllx %g3, 63, %g3 /* for NPT bit */ + ba,a,pt %xcc, 4f + .align 8 /* Ensure rd/wr in same i$ line */ +4: + rd STICK, %g2 /* get stick register */ + wr %g3, %g2, STICK /* write stick register, */ + /* clearing NPT bit */ +3: + jmp %g4 + 4 + wrpr %g0, %g1, %pstate /* restore processor state */ + + SET_SIZE(cpu_clearticknpt) + +#endif /* lint */ + + +#if defined(CPU_IMP_L1_CACHE_PARITY) + +#if defined(lint) +/* + * correct_dcache_parity(size_t size, size_t linesize) + * + * Correct D$ data parity by zeroing the data and initializing microtag + * for all indexes and all ways of the D$. + * + */ +/* ARGSUSED */ +void +correct_dcache_parity(size_t size, size_t linesize) +{} + +#else /* lint */ + + ENTRY(correct_dcache_parity) + /* + * Register Usage: + * + * %o0 = input D$ size + * %o1 = input D$ line size + * %o2 = scratch + * %o3 = scratch + * %o4 = scratch + */ + + sub %o0, %o1, %o0 ! init cache line address + + /* + * For Panther CPUs, we also need to clear the data parity bits + * using DC_data_parity bit of the ASI_DCACHE_DATA register. + */ + GET_CPU_IMPL(%o3) + cmp %o3, PANTHER_IMPL + bne 1f + clr %o3 ! 
zero for non-Panther + mov 1, %o3 + sll %o3, PN_DC_DATA_PARITY_BIT_SHIFT, %o3 + +1: + /* + * Set utag = way since it must be unique within an index. + */ + srl %o0, 14, %o2 ! get cache way (DC_way) + membar #Sync ! required before ASI_DC_UTAG + stxa %o2, [%o0]ASI_DC_UTAG ! set D$ utag = cache way + membar #Sync ! required after ASI_DC_UTAG + + /* + * Zero line of D$ data (and data parity bits for Panther) + */ + sub %o1, 8, %o2 + or %o0, %o3, %o4 ! same address + DC_data_parity +2: + membar #Sync ! required before ASI_DC_DATA + stxa %g0, [%o0 + %o2]ASI_DC_DATA ! zero 8 bytes of D$ data + membar #Sync ! required after ASI_DC_DATA + /* + * We also clear the parity bits if this is a panther. For non-Panther + * CPUs, we simply end up clearing the $data register twice. + */ + stxa %g0, [%o4 + %o2]ASI_DC_DATA + membar #Sync + + subcc %o2, 8, %o2 + bge 2b + nop + + subcc %o0, %o1, %o0 + bge 1b + nop + + retl + nop + SET_SIZE(correct_dcache_parity) + +#endif /* lint */ + +#endif /* CPU_IMP_L1_CACHE_PARITY */ + + +#if defined(lint) +/* + * Get timestamp (stick). + */ +/* ARGSUSED */ +void +stick_timestamp(int64_t *ts) +{ +} + +#else /* lint */ + + ENTRY_NP(stick_timestamp) + rd STICK, %g1 ! read stick reg + sllx %g1, 1, %g1 + srlx %g1, 1, %g1 ! clear npt bit + + retl + stx %g1, [%o0] ! store the timestamp + SET_SIZE(stick_timestamp) + +#endif /* lint */ + + +#if defined(lint) +/* + * Set STICK adjusted by skew. + */ +/* ARGSUSED */ +void +stick_adj(int64_t skew) +{ +} + +#else /* lint */ + + ENTRY_NP(stick_adj) + rdpr %pstate, %g1 ! save processor state + andn %g1, PSTATE_IE, %g3 + ba 1f ! cache align stick adj + wrpr %g0, %g3, %pstate ! turn off interrupts + + .align 16 +1: nop + + rd STICK, %g4 ! read stick reg + add %g4, %o0, %o1 ! adjust stick with skew + wr %o1, %g0, STICK ! write stick reg + + retl + wrpr %g1, %pstate ! restore processor state + SET_SIZE(stick_adj) + +#endif /* lint */ + +#if defined(lint) +/* + * Debugger-specific stick retrieval + */ +/*ARGSUSED*/ +int +kdi_get_stick(uint64_t *stickp) +{ + return (0); +} + +#else /* lint */ + + ENTRY_NP(kdi_get_stick) + rd STICK, %g1 + stx %g1, [%o0] + retl + mov %g0, %o0 + SET_SIZE(kdi_get_stick) + +#endif /* lint */ + +#if defined(lint) +/* + * Invalidate the specified line from the D$. + * + * Register usage: + * %o0 - index for the invalidation, specifies DC_way and DC_addr + * + * ASI_DC_TAG, 0x47, is used in the following manner. A 64-bit value is + * stored to a particular DC_way and DC_addr in ASI_DC_TAG. + * + * The format of the stored 64-bit value is: + * + * +----------+--------+----------+ + * | Reserved | DC_tag | DC_valid | + * +----------+--------+----------+ + * 63 31 30 1 0 + * + * DC_tag is the 30-bit physical tag of the associated line. + * DC_valid is the 1-bit valid field for both the physical and snoop tags. + * + * The format of the 64-bit DC_way and DC_addr into ASI_DC_TAG is: + * + * +----------+--------+----------+----------+ + * | Reserved | DC_way | DC_addr | Reserved | + * +----------+--------+----------+----------+ + * 63 16 15 14 13 5 4 0 + * + * DC_way is a 2-bit index that selects one of the 4 ways. + * DC_addr is a 9-bit index that selects one of 512 tag/valid fields. + * + * Setting the DC_valid bit to zero for the specified DC_way and + * DC_addr index into the D$ results in an invalidation of a D$ line. + */ +/*ARGSUSED*/ +void +dcache_inval_line(int index) +{ +} +#else /* lint */ + ENTRY(dcache_inval_line) + sll %o0, 5, %o0 ! shift index into DC_way and DC_addr + stxa %g0, [%o0]ASI_DC_TAG ! 
zero the DC_valid and DC_tag bits + membar #Sync + retl + nop + SET_SIZE(dcache_inval_line) +#endif /* lint */ + +#if defined(lint) +/* + * Invalidate the entire I$ + * + * Register usage: + * %o0 - specifies IC_way, IC_addr, IC_tag + * %o1 - scratch + * %o2 - used to save and restore DCU value + * %o3 - scratch + * %o5 - used to save and restore PSTATE + * + * Due to the behavior of the I$ control logic when accessing ASI_IC_TAG, + * the I$ should be turned off. Accesses to ASI_IC_TAG may collide and + * block out snoops and invalidates to the I$, causing I$ consistency + * to be broken. Before turning on the I$, all I$ lines must be invalidated. + * + * ASI_IC_TAG, 0x67, is used in the following manner. A 64-bit value is + * stored to a particular IC_way, IC_addr, IC_tag in ASI_IC_TAG. The + * info below describes store (write) use of ASI_IC_TAG. Note that read + * use of ASI_IC_TAG behaves differently. + * + * The format of the stored 64-bit value is: + * + * +----------+--------+---------------+-----------+ + * | Reserved | Valid | IC_vpred<7:0> | Undefined | + * +----------+--------+---------------+-----------+ + * 63 55 54 53 46 45 0 + * + * Valid is the 1-bit valid field for both the physical and snoop tags. + * IC_vpred is the 8-bit LPB bits for 8 instructions starting at + * the 32-byte boundary aligned address specified by IC_addr. + * + * The format of the 64-bit IC_way, IC_addr, IC_tag into ASI_IC_TAG is: + * + * +----------+--------+---------+--------+---------+ + * | Reserved | IC_way | IC_addr | IC_tag |Reserved | + * +----------+--------+---------+--------+---------+ + * 63 16 15 14 13 5 4 3 2 0 + * + * IC_way is a 2-bit index that selects one of the 4 ways. + * IC_addr[13:6] is an 8-bit index that selects one of 256 valid fields. + * IC_addr[5] is a "don't care" for a store. + * IC_tag set to 2 specifies that the stored value is to be interpreted + * as containing Valid and IC_vpred as described above. + * + * Setting the Valid bit to zero for the specified IC_way and + * IC_addr index into the I$ results in an invalidation of an I$ line. + */ +/*ARGSUSED*/ +void +icache_inval_all(void) +{ +} +#else /* lint */ + ENTRY(icache_inval_all) + rdpr %pstate, %o5 + andn %o5, PSTATE_IE, %o3 + wrpr %g0, %o3, %pstate ! clear IE bit + + GET_CPU_PRIVATE_PTR(%g0, %o0, %o2, icache_inval_all_1); + ld [%o0 + CHPR_ICACHE_LINESIZE], %o1 + ba,pt %icc, 2f + ld [%o0 + CHPR_ICACHE_SIZE], %o0 +icache_inval_all_1: + ASM_LD(%o0, icache_size) + ASM_LD(%o1, icache_linesize) +2: + CH_ICACHE_FLUSHALL(%o0, %o1, %o2, %o4) + + retl + wrpr %g0, %o5, %pstate ! restore earlier pstate + SET_SIZE(icache_inval_all) +#endif /* lint */ + + +#if defined(lint) +/* ARGSUSED */ +void +cache_scrubreq_tl1(uint64_t inum, uint64_t index) +{ +} + +#else /* lint */ +/* + * cache_scrubreq_tl1 is the crosstrap handler called on offlined cpus via a + * crosstrap. It atomically increments the outstanding request counter and, + * if there was not already an outstanding request, branches to setsoftint_tl1 + * to enqueue an intr_req for the given inum. + */ + + ! Register usage: + ! + ! Arguments: + ! %g1 - inum + ! %g2 - index into chsm_outstanding array + ! + ! Internal: + ! %g2, %g3, %g5 - scratch + ! %g4 - ptr. to scrub_misc chsm_outstanding[index]. + ! %g6 - setsoftint_tl1 address + + ENTRY_NP(cache_scrubreq_tl1) + mulx %g2, CHSM_OUTSTANDING_INCR, %g2 + set CHPR_SCRUB_MISC + CHSM_OUTSTANDING, %g3 + add %g2, %g3, %g2 + GET_CPU_PRIVATE_PTR(%g2, %g4, %g5, 1f); + ld [%g4], %g2 ! cpu's chsm_outstanding[index] + ! + ! 
no need to use atomic instructions for the following + ! increment - we're at tl1 + ! + add %g2, 0x1, %g3 + brnz,pn %g2, 1f ! no need to enqueue more intr_req + st %g3, [%g4] ! delay - store incremented counter + ASM_JMP(%g6, setsoftint_tl1) + ! not reached +1: + retry + SET_SIZE(cache_scrubreq_tl1) + +#endif /* lint */ + + +#if defined(lint) + +/* ARGSUSED */ +void +get_cpu_error_state(ch_cpu_errors_t *cpu_error_regs) +{} + +#else /* lint */ + +/* + * Get the error state for the processor. + * Note that this must not be used at TL>0 + */ + ENTRY(get_cpu_error_state) +#if defined(CHEETAH_PLUS) + set ASI_SHADOW_REG_VA, %o2 + ldxa [%o2]ASI_AFSR, %o1 ! shadow afsr reg + stx %o1, [%o0 + CH_CPU_ERRORS_SHADOW_AFSR] + ldxa [%o2]ASI_AFAR, %o1 ! shadow afar reg + stx %o1, [%o0 + CH_CPU_ERRORS_SHADOW_AFAR] + GET_CPU_IMPL(%o3) ! Only panther has AFSR_EXT registers + cmp %o3, PANTHER_IMPL + bne,a 1f + stx %g0, [%o0 + CH_CPU_ERRORS_AFSR_EXT] ! zero for non-PN + set ASI_AFSR_EXT_VA, %o2 + ldxa [%o2]ASI_AFSR, %o1 ! afsr_ext reg + stx %o1, [%o0 + CH_CPU_ERRORS_AFSR_EXT] + set ASI_SHADOW_AFSR_EXT_VA, %o2 + ldxa [%o2]ASI_AFSR, %o1 ! shadow afsr_ext reg + stx %o1, [%o0 + CH_CPU_ERRORS_SHADOW_AFSR_EXT] + b 2f + nop +1: + stx %g0, [%o0 + CH_CPU_ERRORS_SHADOW_AFSR_EXT] ! zero for non-PN +2: +#else /* CHEETAH_PLUS */ + stx %g0, [%o0 + CH_CPU_ERRORS_SHADOW_AFSR] + stx %g0, [%o0 + CH_CPU_ERRORS_SHADOW_AFAR] + stx %g0, [%o0 + CH_CPU_ERRORS_AFSR_EXT] + stx %g0, [%o0 + CH_CPU_ERRORS_SHADOW_AFSR_EXT] +#endif /* CHEETAH_PLUS */ +#if defined(SERRANO) + /* + * Serrano has an afar2 which captures the address on FRC/FRU errors. + * We save this in the afar2 of the register save area. + */ + set ASI_MCU_AFAR2_VA, %o2 + ldxa [%o2]ASI_MCU_CTRL, %o1 + stx %o1, [%o0 + CH_CPU_ERRORS_AFAR2] +#endif /* SERRANO */ + ldxa [%g0]ASI_AFSR, %o1 ! primary afsr reg + stx %o1, [%o0 + CH_CPU_ERRORS_AFSR] + ldxa [%g0]ASI_AFAR, %o1 ! primary afar reg + retl + stx %o1, [%o0 + CH_CPU_ERRORS_AFAR] + SET_SIZE(get_cpu_error_state) +#endif /* lint */ + +#if defined(lint) + +/* + * Check a page of memory for errors. + * + * Load each 64 byte block from physical memory. + * Check AFSR after each load to see if an error + * was caused. If so, log/scrub that error. + * + * Used to determine if a page contains + * CEs when CEEN is disabled. + */ +/*ARGSUSED*/ +void +cpu_check_block(caddr_t va, uint_t psz) +{} + +#else /* lint */ + + ENTRY(cpu_check_block) + ! + ! get a new window with room for the error regs + ! + save %sp, -SA(MINFRAME + CH_CPU_ERROR_SIZE), %sp + srl %i1, 6, %l4 ! clear top bits of psz + ! and divide by 64 + rd %fprs, %l2 ! store FP + wr %g0, FPRS_FEF, %fprs ! enable FP +1: + ldda [%i0]ASI_BLK_P, %d0 ! load a block + membar #Sync + ldxa [%g0]ASI_AFSR, %l3 ! read afsr reg + brz,a,pt %l3, 2f ! check for error + nop + + ! + ! if error, read the error regs and log it + ! + call get_cpu_error_state + add %fp, STACK_BIAS - CH_CPU_ERROR_SIZE, %o0 + + ! + ! cpu_ce_detected(ch_cpu_errors_t *, flag) + ! + call cpu_ce_detected ! log the error + mov CE_CEEN_TIMEOUT, %o1 +2: + dec %l4 ! next 64-byte block + brnz,a,pt %l4, 1b + add %i0, 64, %i0 ! increment block addr + + wr %l2, %g0, %fprs ! restore FP + ret + restore + + SET_SIZE(cpu_check_block) + +#endif /* lint */ + +#if defined(lint) + +/* + * Perform a cpu logout called from C. This is used where we did not trap + * for the error but still want to gather "what we can". 
Caller must make + * sure cpu private area exists and that the indicated logout area is free + * for use, and that we are unable to migrate cpus. + */ +/*ARGSUSED*/ +void +cpu_delayed_logout(uint64_t afar, ch_cpu_logout_t *clop) +{ } + +#else + ENTRY(cpu_delayed_logout) + rdpr %pstate, %o2 + andn %o2, PSTATE_IE, %o2 + wrpr %g0, %o2, %pstate ! disable interrupts + PARK_SIBLING_CORE(%o2, %o3, %o4) ! %o2 has DCU value + add %o1, CH_CLO_DATA + CH_CHD_EC_DATA, %o1 + rd %asi, %g1 + wr %g0, ASI_P, %asi + GET_ECACHE_DTAGS(%o0, %o1, %o3, %o4, %o5) + wr %g1, %asi + UNPARK_SIBLING_CORE(%o2, %o3, %o4) ! can use %o2 again + rdpr %pstate, %o2 + or %o2, PSTATE_IE, %o2 + wrpr %g0, %o2, %pstate + retl + nop + SET_SIZE(cpu_delayed_logout) + +#endif /* lint */ + +#if defined(lint) + +/*ARGSUSED*/ +int +dtrace_blksuword32(uintptr_t addr, uint32_t *data, int tryagain) +{ return (0); } + +#else + + ENTRY(dtrace_blksuword32) + save %sp, -SA(MINFRAME + 4), %sp + + rdpr %pstate, %l1 + andn %l1, PSTATE_IE, %l2 ! disable interrupts to + wrpr %g0, %l2, %pstate ! protect our FPU diddling + + rd %fprs, %l0 + andcc %l0, FPRS_FEF, %g0 + bz,a,pt %xcc, 1f ! if the fpu is disabled + wr %g0, FPRS_FEF, %fprs ! ... enable the fpu + + st %f0, [%fp + STACK_BIAS - 4] ! save %f0 to the stack +1: + set 0f, %l5 + /* + * We're about to write a block full or either total garbage + * (not kernel data, don't worry) or user floating-point data + * (so it only _looks_ like garbage). + */ + ld [%i1], %f0 ! modify the block + membar #Sync + stn %l5, [THREAD_REG + T_LOFAULT] ! set up the lofault handler + stda %d0, [%i0]ASI_BLK_COMMIT_S ! store the modified block + membar #Sync + stn %g0, [THREAD_REG + T_LOFAULT] ! remove the lofault handler + + bz,a,pt %xcc, 1f + wr %g0, %l0, %fprs ! restore %fprs + + ld [%fp + STACK_BIAS - 4], %f0 ! restore %f0 +1: + + wrpr %g0, %l1, %pstate ! restore interrupts + + ret + restore %g0, %g0, %o0 + +0: + membar #Sync + stn %g0, [THREAD_REG + T_LOFAULT] ! remove the lofault handler + + bz,a,pt %xcc, 1f + wr %g0, %l0, %fprs ! restore %fprs + + ld [%fp + STACK_BIAS - 4], %f0 ! restore %f0 +1: + + wrpr %g0, %l1, %pstate ! restore interrupts + + /* + * If tryagain is set (%i2) we tail-call dtrace_blksuword32_err() + * which deals with watchpoints. Otherwise, just return -1. + */ + brnz,pt %i2, 1f + nop + ret + restore %g0, -1, %o0 +1: + call dtrace_blksuword32_err + restore + + SET_SIZE(dtrace_blksuword32) + +#endif /* lint */ + +#ifdef CHEETAHPLUS_ERRATUM_25 + +#if defined(lint) +/* + * Claim a chunk of physical address space. + */ +/*ARGSUSED*/ +void +claimlines(uint64_t pa, size_t sz, int stride) +{} +#else /* lint */ + ENTRY(claimlines) +1: + subcc %o1, %o2, %o1 + add %o0, %o1, %o3 + bgeu,a,pt %xcc, 1b + casxa [%o3]ASI_MEM, %g0, %g0 + membar #Sync + retl + nop + SET_SIZE(claimlines) +#endif /* lint */ + +#if defined(lint) +/* + * CPU feature initialization, + * turn BPE off, + * get device id. + */ +/*ARGSUSED*/ +void +cpu_feature_init(void) +{} +#else /* lint */ + ENTRY(cpu_feature_init) + save %sp, -SA(MINFRAME), %sp + sethi %hi(cheetah_bpe_off), %o0 + ld [%o0 + %lo(cheetah_bpe_off)], %o0 + brz %o0, 1f + nop + rd ASR_DISPATCH_CONTROL, %o0 + andn %o0, ASR_DISPATCH_CONTROL_BPE, %o0 + wr %o0, 0, ASR_DISPATCH_CONTROL +1: + ! + ! get the device_id and store the device_id + ! in the appropriate cpunodes structure + ! given the cpus index + ! 
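+	! In C terms this is roughly (illustrative sketch):
+	!	cpunodes[cpu_index].device_id =
+	!	    <64-bit value read via ASI_DEVICE_SERIAL_ID>
+	!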
+ CPU_INDEX(%o0, %o1) + mulx %o0, CPU_NODE_SIZE, %o0 + set cpunodes + DEVICE_ID, %o1 + ldxa [%g0] ASI_DEVICE_SERIAL_ID, %o2 + stx %o2, [%o0 + %o1] +#ifdef CHEETAHPLUS_ERRATUM_34 + ! + ! apply Cheetah+ erratum 34 workaround + ! + call itlb_erratum34_fixup + nop +#endif /* CHEETAHPLUS_ERRATUM_34 */ + ret + restore + SET_SIZE(cpu_feature_init) +#endif /* lint */ + +#if defined(lint) +/* + * Copy a tsb entry atomically, from src to dest. + * src must be 128 bit aligned. + */ +/*ARGSUSED*/ +void +copy_tsb_entry(uintptr_t src, uintptr_t dest) +{} +#else /* lint */ + ENTRY(copy_tsb_entry) + ldda [%o0]ASI_NQUAD_LD, %o2 ! %o2 = tag, %o3 = data + stx %o2, [%o1] + stx %o3, [%o1 + 8 ] + retl + nop + SET_SIZE(copy_tsb_entry) +#endif /* lint */ + +#endif /* CHEETAHPLUS_ERRATUM_25 */ + +#ifdef CHEETAHPLUS_ERRATUM_34 + +#if defined(lint) + +/*ARGSUSED*/ +void +itlb_erratum34_fixup(void) +{} + +#else /* lint */ + + ! + ! In Cheetah+ erratum 34, under certain conditions an ITLB locked + ! index 0 TTE will erroneously be displaced when a new TTE is + ! loaded via ASI_ITLB_IN. In order to avoid cheetah+ erratum 34, + ! locked index 0 TTEs must be relocated. + ! + ! NOTE: Care must be taken to avoid an ITLB miss in this routine. + ! + ENTRY_NP(itlb_erratum34_fixup) + rdpr %pstate, %o3 +#ifdef DEBUG + andcc %o3, PSTATE_IE, %g0 ! If interrupts already + bnz,pt %icc, 0f ! disabled, panic + nop + sethi %hi(sfmmu_panic1), %o0 + call panic + or %o0, %lo(sfmmu_panic1), %o0 +0: +#endif /* DEBUG */ + wrpr %o3, PSTATE_IE, %pstate ! Disable interrupts + ldxa [%g0]ASI_ITLB_ACCESS, %o1 ! %o1 = entry 0 data + ldxa [%g0]ASI_ITLB_TAGREAD, %o2 ! %o2 = entry 0 tag + + cmp %o1, %g0 ! Is this entry valid? + bge %xcc, 1f + andcc %o1, TTE_LCK_INT, %g0 ! Is this entry locked? + bnz %icc, 2f + nop +1: + retl ! Nope, outta here... + wrpr %g0, %o3, %pstate ! Enable interrupts +2: + sethi %hi(FLUSH_ADDR), %o4 + stxa %g0, [%o2]ASI_ITLB_DEMAP ! Flush this mapping + flush %o4 ! Flush required for I-MMU + ! + ! Start search from index 1 up. This is because the kernel force + ! loads its text page at index 15 in sfmmu_kernel_remap() and we + ! don't want our relocated entry evicted later. + ! + ! NOTE: We assume that we'll be successful in finding an unlocked + ! or invalid entry. If that isn't the case there are bound to + ! bigger problems. + ! + set (1 << 3), %g3 +3: + ldxa [%g3]ASI_ITLB_ACCESS, %o4 ! Load TTE from t16 + ! + ! If this entry isn't valid, we'll choose to displace it (regardless + ! of the lock bit). + ! + cmp %o4, %g0 ! TTE is > 0 iff not valid + bge %xcc, 4f ! If invalid, go displace + andcc %o4, TTE_LCK_INT, %g0 ! Check for lock bit + bnz,a %icc, 3b ! If locked, look at next + add %g3, (1 << 3), %g3 ! entry +4: + ! + ! We found an unlocked or invalid entry; we'll explicitly load + ! the former index 0 entry here. + ! + sethi %hi(FLUSH_ADDR), %o4 + set MMU_TAG_ACCESS, %g4 + stxa %o2, [%g4]ASI_IMMU + stxa %o1, [%g3]ASI_ITLB_ACCESS + flush %o4 ! Flush required for I-MMU + retl + wrpr %g0, %o3, %pstate ! Enable interrupts + SET_SIZE(itlb_erratum34_fixup) + +#endif /* lint */ + +#endif /* CHEETAHPLUS_ERRATUM_34 */ + diff --git a/usr/src/uts/sun4u/cpu/us3_common_mmu.c b/usr/src/uts/sun4u/cpu/us3_common_mmu.c new file mode 100644 index 0000000000..c4bc9fc451 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/us3_common_mmu.c @@ -0,0 +1,661 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/archsystm.h> +#include <sys/vmsystm.h> +#include <sys/machparam.h> +#include <sys/machsystm.h> +#include <vm/vm_dep.h> +#include <vm/hat_sfmmu.h> +#include <vm/seg_kmem.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/cpu_module.h> +#include <sys/sysmacros.h> +#include <sys/panic.h> + +/* + * Note that 'Cheetah PRM' refers to: + * SPARC V9 JPS1 Implementation Supplement: Sun UltraSPARC-III + */ + +/* Will be set !NULL for Cheetah+ and derivatives. */ +extern uchar_t *ctx_pgsz_array; + +/* + * pan_disable_ism_large_pages and pan_disable_large_pages are the Panther- + * specific versions of disable_ism_large_pages and disable_large_pages, + * and feed back into those two hat variables at hat initialization time, + * for Panther-only systems. + * + * chpjag_disable_ism_large_pages is the Ch/Jaguar-specific version of + * disable_ism_large_pages. Ditto for chjag_disable_large_pages. + */ +static int panther_only = 0; + +static int pan_disable_ism_large_pages = ((1 << TTE64K) | + (1 << TTE512K) | (1 << TTE256M)); +static int pan_disable_large_pages = (1 << TTE256M); +static int pan_disable_auto_large_pages = (1 << TTE4M) | (1 << TTE256M); + +static int chjag_disable_ism_large_pages = ((1 << TTE64K) | + (1 << TTE512K) | (1 << TTE32M) | (1 << TTE256M)); +static int chjag_disable_large_pages = ((1 << TTE32M) | (1 << TTE256M)); +static int chjag_disable_auto_large_pages = ((1 << TTE32M) | (1 << TTE256M)); + +/* + * The function returns the USIII-IV mmu-specific values for the + * hat's disable_large_pages and disable_ism_large_pages variables. + * Currently the hat's disable_large_pages and disable_ism_large_pages + * already contain the generic sparc 4 page size info, and the return + * values are or'd with those values. 
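+ *
+ * A minimal sketch of how the hat layer consumes these masks (the
+ * variable names below mirror the generic hat code and are shown for
+ * illustration only):
+ *
+ *	disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD);
+ *	disable_ism_large_pages |= mmu_large_pages_disabled(HAT_LOAD_SHARE);
+ *	disable_auto_large_pages |=
+ *	    mmu_large_pages_disabled(HAT_LOAD_AUTOLPG);
+ *
+ * A set bit (1 << TTEszc) in the returned mask disables that page size
+ * for the corresponding kind of mapping.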
+ */ +int +mmu_large_pages_disabled(uint_t flag) +{ + int pages_disable = 0; + + if (panther_only) { + if (flag == HAT_LOAD) { + pages_disable = pan_disable_large_pages; + } else if (flag == HAT_LOAD_SHARE) { + pages_disable = pan_disable_ism_large_pages; + } else if (flag == HAT_LOAD_AUTOLPG) { + pages_disable = pan_disable_auto_large_pages; + } + } else { + if (flag == HAT_LOAD) { + pages_disable = chjag_disable_large_pages; + } else if (flag == HAT_LOAD_SHARE) { + pages_disable = chjag_disable_ism_large_pages; + } else if (flag == HAT_LOAD_AUTOLPG) { + pages_disable = chjag_disable_auto_large_pages; + } + } + return (pages_disable); +} + +#if defined(CPU_IMP_DUAL_PAGESIZE) +/* + * If a platform is running with only Ch+ or Jaguar, and then someone DR's + * in a Panther board, the Panther mmu will not like it if one of the already + * running threads is context switched to the Panther and tries to program + * a 512K or 4M page into the T512_1. So make these platforms pay the price + * and follow the Panther DTLB restrictions by default. :) + * The mmu_init_mmu_page_sizes code below takes care of heterogeneous + * platforms that don't support DR, like daktari. + * + * The effect of these restrictions is to limit the allowable values in + * sfmmu_pgsz[0] and sfmmu_pgsz[1], since these hat variables are used in + * mmu_set_ctx_page_sizes to set up the values in the ctx_pgsz_array that + * are used at context switch time. The value in sfmmu_pgsz[0] is used in + * P_pgsz0 and sfmmu_pgsz[1] is used in P_pgsz1, as per Figure F-1-1 + * IMMU and DMMU Primary Context Register in the Panther Implementation + * Supplement and Table 15-21 DMMU Primary Context Register in the + * Cheetah+ Delta PRM. + */ +#ifdef MIXEDCPU_DR_SUPPORTED +int panther_dtlb_restrictions = 1; +#else +int panther_dtlb_restrictions = 0; +#endif /* MIXEDCPU_DR_SUPPORTED */ + +/* + * init_mmu_page_sizes is set to one after the bootup time initialization + * via mmu_init_mmu_page_sizes, to indicate that mmu_page_sizes has a + * valid value. + */ +int init_mmu_page_sizes = 0; + +/* + * mmu_init_large_pages is called with the desired ism_pagesize parameter, + * for Panther-only systems. It may be called from set_platform_defaults, + * if some value other than 32M is desired, for Panther-only systems. + * mmu_ism_pagesize is the tunable. If it has a bad value, then only warn, + * since it would be bad form to panic due + * to a user typo. + * + * The function re-initializes the pan_disable_ism_large_pages and + * pan_disable_large_pages variables, which are closely related. + * Aka, if 32M is the desired [D]ISM page sizes, then 256M cannot be allowed + * for non-ISM large page usage, or DTLB conflict will occur. Please see the + * Panther PRM for additional DTLB technical info. 
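+ *
+ * For example, with the default mmu_ism_pagesize of 32M the switch
+ * below leaves the masks as:
+ *
+ *	pan_disable_ism_large_pages = (1 << TTE64K) | (1 << TTE512K) |
+ *	    (1 << TTE256M);			ISM may use 8K, 4M or 32M
+ *	pan_disable_large_pages = (1 << TTE256M);	no 256M non-ISM pages
+ *	pan_disable_auto_large_pages = (1 << TTE4M) | (1 << TTE256M);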
+ */ +void +mmu_init_large_pages(size_t ism_pagesize) +{ + if (ctx_pgsz_array == NULL) { /* disable_dual_pgsz flag */ + pan_disable_ism_large_pages = ((1 << TTE64K) | + (1 << TTE512K) | (1 << TTE32M) | (1 << TTE256M)); + pan_disable_large_pages = ((1 << TTE32M) | (1 << TTE256M)); + auto_lpg_maxszc = TTE4M; + return; + } + + switch (ism_pagesize) { + case MMU_PAGESIZE4M: + pan_disable_ism_large_pages = ((1 << TTE64K) | + (1 << TTE512K) | (1 << TTE32M) | (1 << TTE256M)); + pan_disable_large_pages = (1 << TTE256M); + pan_disable_auto_large_pages = (1 << TTE32M) | (1 << TTE256M); + auto_lpg_maxszc = TTE4M; + break; + case MMU_PAGESIZE32M: + pan_disable_ism_large_pages = ((1 << TTE64K) | + (1 << TTE512K) | (1 << TTE256M)); + pan_disable_large_pages = (1 << TTE256M); + pan_disable_auto_large_pages = (1 << TTE4M) | (1 << TTE256M); + auto_lpg_maxszc = TTE32M; + break; + case MMU_PAGESIZE256M: + pan_disable_ism_large_pages = ((1 << TTE64K) | + (1 << TTE512K) | (1 << TTE32M)); + pan_disable_large_pages = (1 << TTE32M); + pan_disable_auto_large_pages = (1 << TTE4M) | (1 << TTE32M); + auto_lpg_maxszc = TTE256M; + break; + default: + cmn_err(CE_WARN, "Unrecognized mmu_ism_pagesize value 0x%lx", + ism_pagesize); + break; + } +} + +/* + * Re-initialize mmu_page_sizes and friends, for Panther mmu support. + * Called during very early bootup from check_cpus_set(). + * Can be called to verify that mmu_page_sizes are set up correctly. + * Note that ncpus is not initialized at this point in the bootup sequence. + */ +int +mmu_init_mmu_page_sizes(int cinfo) +{ + int npanther = cinfo; + + if (!init_mmu_page_sizes) { + if (npanther == ncpunode) { + mmu_page_sizes = MMU_PAGE_SIZES; + mmu_hashcnt = MAX_HASHCNT; + mmu_ism_pagesize = MMU_PAGESIZE32M; + mmu_exported_pagesize_mask = (1 << TTE8K) | + (1 << TTE64K) | (1 << TTE512K) | (1 << TTE4M) | + (1 << TTE32M) | (1 << TTE256M); + panther_dtlb_restrictions = 1; + panther_only = 1; + auto_lpg_maxszc = TTE32M; + } else if (npanther > 0) { + panther_dtlb_restrictions = 1; + } + auto_lpg_maxszc = mmu_page_sizes - 1; + init_mmu_page_sizes = 1; + return (0); + } + return (1); +} + + +/* Cheetah+ and later worst case DTLB parameters */ +#ifndef LOCKED_DTLB_ENTRIES +#define LOCKED_DTLB_ENTRIES 5 /* 2 user TSBs, 2 nucleus, + OBP */ +#endif +#define TOTAL_DTLB_ENTRIES 16 +#define AVAIL_32M_ENTRIES 0 +#define AVAIL_256M_ENTRIES 0 +#define AVAIL_DTLB_ENTRIES (TOTAL_DTLB_ENTRIES - LOCKED_DTLB_ENTRIES) +static uint64_t ttecnt_threshold[MMU_PAGE_SIZES] = { + AVAIL_DTLB_ENTRIES, AVAIL_DTLB_ENTRIES, + AVAIL_DTLB_ENTRIES, AVAIL_DTLB_ENTRIES, + AVAIL_32M_ENTRIES, AVAIL_256M_ENTRIES }; + +/*ARGSUSED*/ +uint_t +mmu_preferred_pgsz(struct hat *hat, caddr_t addr, size_t len) +{ + sfmmu_t *sfmmup = (sfmmu_t *)hat; + uint_t pgsz0, pgsz1; + uint_t szc, maxszc = mmu_page_sizes - 1; + size_t pgsz; + extern int disable_large_pages; + + pgsz0 = (uint_t)sfmmup->sfmmu_pgsz[0]; + pgsz1 = (uint_t)sfmmup->sfmmu_pgsz[1]; + + /* + * If either of the TLBs are reprogrammed, choose + * the largest mapping size as the preferred size, + * if it fits the size and alignment constraints. + * Else return the largest mapping size that fits, + * if neither TLB is reprogrammed. 
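+ *
+ * Worked example (illustrative values): with sfmmu_pgsz[] =
+ * { TTE8K, TTE4M }, a request for addr = 0x40400000, len = 8M returns
+ * TTE4M, since the address is 4M aligned and len >= 4M; the same
+ * request at addr = 0x40010000 returns TTE8K because the alignment
+ * test fails.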
+ */ + if (pgsz0 > TTE8K || pgsz1 > TTE8K) { + if (pgsz1 > pgsz0) { /* First try pgsz1 */ + pgsz = hw_page_array[pgsz1].hp_size; + if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz)) + return (pgsz1); + } + if (pgsz0 > TTE8K) { /* Then try pgsz0, if !TTE8K */ + pgsz = hw_page_array[pgsz0].hp_size; + if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz)) + return (pgsz0); + } + } else { /* Otherwise pick best fit if neither TLB is reprogrammed. */ + for (szc = maxszc; szc > TTE8K; szc--) { + if (disable_large_pages & (1 << szc)) + continue; + + pgsz = hw_page_array[szc].hp_size; + if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz)) + return (szc); + } + } + return (TTE8K); +} + +/* + * The purpose of this code is to indirectly reorganize the sfmmu_pgsz array + * in order to handle the Panther mmu DTLB requirements. Panther only supports + * the 32M/256M pages in the T512_1 and not in the T16, so the Panther cpu + * can only support one of the two largest page sizes at a time (efficiently). + * Panther only supports 512K and 4M pages in the T512_0, and 32M/256M pages + * in the T512_1. So check the sfmmu flags and ttecnt before enabling + * the T512_1 for 32M or 256M page sizes, and make sure that 512K and 4M + * requests go to the T512_0. + * + * The tmp_pgsz array comes into this routine in sorted order, as it is + * sorted from largest to smallest #pages per pagesize in use by the hat code, + * and leaves with the Panther mmu DTLB requirements satisfied. Note that + * when the array leaves this function it may not contain all of the page + * size codes that it had coming into the function. + * + * Note that for DISM the flag can be set but the ttecnt can be 0, if we + * didn't fault any pages in. This allows the t512_1 to be reprogrammed, + * because the T16 does not support the two giant page sizes. ouch. + */ +void +mmu_fixup_large_pages(struct hat *hat, uint64_t *ttecnt, uint8_t *tmp_pgsz) +{ + uint_t pgsz0 = tmp_pgsz[0]; + uint_t pgsz1 = tmp_pgsz[1]; + uint_t spgsz; + + /* + * Don't program 2nd dtlb for kernel and ism hat + */ + ASSERT(hat->sfmmu_ismhat == NULL); + ASSERT(hat != ksfmmup); + ASSERT(ctx_pgsz_array != NULL); + + ASSERT((!SFMMU_FLAGS_ISSET(hat, HAT_32M_FLAG)) || + (!SFMMU_FLAGS_ISSET(hat, HAT_256M_FLAG))); + + if ((SFMMU_FLAGS_ISSET(hat, HAT_32M_FLAG)) || (ttecnt[TTE32M] != 0)) { + spgsz = pgsz1; + pgsz1 = TTE32M; + if (pgsz0 == TTE32M) + pgsz0 = spgsz; + } else if ((SFMMU_FLAGS_ISSET(hat, HAT_256M_FLAG)) || + (ttecnt[TTE256M] != 0)) { + spgsz = pgsz1; + pgsz1 = TTE256M; + if (pgsz0 == TTE256M) + pgsz0 = spgsz; + } else if ((pgsz1 == TTE512K) || (pgsz1 == TTE4M)) { + if ((pgsz0 != TTE512K) && (pgsz0 != TTE4M)) { + spgsz = pgsz0; + pgsz0 = pgsz1; + pgsz1 = spgsz; + } else { + pgsz1 = page_szc(MMU_PAGESIZE); + } + } + /* + * This implements PAGESIZE programming of the T8s + * if large TTE counts don't exceed the thresholds. + */ + if (ttecnt[pgsz0] < ttecnt_threshold[pgsz0]) + pgsz0 = page_szc(MMU_PAGESIZE); + if (ttecnt[pgsz1] < ttecnt_threshold[pgsz1]) + pgsz1 = page_szc(MMU_PAGESIZE); + tmp_pgsz[0] = pgsz0; + tmp_pgsz[1] = pgsz1; +} + +/* + * Function to set up the page size values used to reprogram the DTLBs, + * when page sizes used by a process change significantly. 
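+ *
+ * For example (sketch, non-Panther-restricted configuration): if a
+ * process's two most used sizes are TTE4M and TTE64K, ttecnt[TTE4M]
+ * is well above the threshold but only three 64K pages are mapped,
+ * the MIN/MAX ordering plus the threshold test below give
+ *
+ *	tmp_pgsz[0] = page_szc(MMU_PAGESIZE);	64K count below threshold
+ *	tmp_pgsz[1] = TTE4M;
+ *
+ * so only one T8 is reprogrammed away from the base page size.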
+ */ +void +mmu_setup_page_sizes(struct hat *hat, uint64_t *ttecnt, uint8_t *tmp_pgsz) +{ + uint_t pgsz0, pgsz1; + + /* + * Don't program 2nd dtlb for kernel and ism hat + */ + ASSERT(hat->sfmmu_ismhat == NULL); + ASSERT(hat != ksfmmup); + + if (ctx_pgsz_array == NULL) /* disable_dual_pgsz flag */ + return; + + /* + * hat->sfmmu_pgsz[] is an array whose elements + * contain a sorted order of page sizes. Element + * 0 is the most commonly used page size, followed + * by element 1, and so on. + * + * ttecnt[] is an array of per-page-size page counts + * mapped into the process. + * + * If the HAT's choice for page sizes is unsuitable, + * we can override it here. The new values written + * to the array will be handed back to us later to + * do the actual programming of the TLB hardware. + * + * The policy we use for programming the dual T8s on + * Cheetah+ and beyond is as follows: + * + * We have two programmable TLBs, so we look at + * the two most common page sizes in the array, which + * have already been computed for us by the HAT. + * If the TTE count of either of a preferred page size + * exceeds the number of unlocked T16 entries, + * we reprogram one of the T8s to that page size + * to avoid thrashing in the T16. Else we program + * that T8 to the base page size. Note that we do + * not force either T8 to be the base page size if a + * process is using more than two page sizes. Policy + * decisions about which page sizes are best to use are + * left to the upper layers. + * + * Note that for Panther, 4M and 512K pages need to be + * programmed into T512_0, and 32M and 256M into T512_1, + * so we don't want to go through the MIN/MAX code. + * For partial-Panther systems, we still want to make sure + * that 4M and 512K page sizes NEVER get into the T512_1. + * Since the DTLB flags are not set up on a per-cpu basis, + * Panther rules must be applied for mixed Panther/Cheetah+/ + * Jaguar configurations. + */ + if (panther_dtlb_restrictions) { + if ((tmp_pgsz[1] == TTE512K) || (tmp_pgsz[1] == TTE4M)) { + if ((tmp_pgsz[0] != TTE512K) && + (tmp_pgsz[0] != TTE4M)) { + pgsz1 = tmp_pgsz[0]; + pgsz0 = tmp_pgsz[1]; + } else { + pgsz0 = tmp_pgsz[0]; + pgsz1 = page_szc(MMU_PAGESIZE); + } + } else { + pgsz0 = tmp_pgsz[0]; + pgsz1 = tmp_pgsz[1]; + } + } else { + pgsz0 = MIN(tmp_pgsz[0], tmp_pgsz[1]); + pgsz1 = MAX(tmp_pgsz[0], tmp_pgsz[1]); + } + + /* + * This implements PAGESIZE programming of the T8s + * if large TTE counts don't exceed the thresholds. + */ + if (ttecnt[pgsz0] < ttecnt_threshold[pgsz0]) + pgsz0 = page_szc(MMU_PAGESIZE); + if (ttecnt[pgsz1] < ttecnt_threshold[pgsz1]) + pgsz1 = page_szc(MMU_PAGESIZE); + tmp_pgsz[0] = pgsz0; + tmp_pgsz[1] = pgsz1; +} + +/* + * The HAT calls this function when an MMU context is allocated so that we + * can reprogram the large TLBs appropriately for the new process using + * the context. + * + * The caller must hold the HAT lock. + */ +void +mmu_set_ctx_page_sizes(struct hat *hat) +{ + uint_t pgsz0, pgsz1; + uint_t new_cext; + + ASSERT(sfmmu_hat_lock_held(hat)); + ASSERT(hat != ksfmmup); + + if (ctx_pgsz_array == NULL) /* disable_dual_pgsz flag */ + return; + + /* + * If supported, reprogram the TLBs to a larger pagesize. 
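+ *
+ * Sketch of the effect: with sfmmu_pgsz[] = { TTE8K, TTE4M } the code
+ * below computes
+ *
+ *	new_cext = TAGACCEXT_MKSZPAIR(TTE4M, TTE8K);
+ *	ctx_pgsz_array[hat->sfmmu_cnum] = new_cext;
+ *
+ * and sfmmu_setctx_sec() later pushes that value into the I/D MMU
+ * primary context register extension when the thread runs.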
+ */ + pgsz0 = hat->sfmmu_pgsz[0]; + pgsz1 = hat->sfmmu_pgsz[1]; + ASSERT(pgsz0 < mmu_page_sizes); + ASSERT(pgsz1 < mmu_page_sizes); +#ifdef DEBUG + if (panther_dtlb_restrictions) { + ASSERT(pgsz1 != TTE512K); + ASSERT(pgsz1 != TTE4M); + } + if (panther_only) { + ASSERT(pgsz0 != TTE32M); + ASSERT(pgsz0 != TTE256M); + } +#endif /* DEBUG */ + new_cext = TAGACCEXT_MKSZPAIR(pgsz1, pgsz0); + if (hat->sfmmu_cext != new_cext) { + hat->sfmmu_cext = new_cext; + } + ctx_pgsz_array[hat->sfmmu_cnum] = hat->sfmmu_cext; + /* + * sfmmu_setctx_sec() will take care of the + * rest of the chores reprogramming the ctx_pgsz_array + * page size values into the DTLBs. + */ +} + +/* + * This function assumes that there are either four or six supported page + * sizes and at most two programmable TLBs, so we need to decide which + * page sizes are most important and then adjust the TLB page sizes + * accordingly (if supported). + * + * If these assumptions change, this function will need to be + * updated to support whatever the new limits are. + */ +void +mmu_check_page_sizes(sfmmu_t *sfmmup, uint64_t *ttecnt) +{ + uint64_t sortcnt[MMU_PAGE_SIZES]; + uint8_t tmp_pgsz[MMU_PAGE_SIZES]; + uint8_t i, j, max; + uint16_t oldval, newval; + + /* + * We only consider reprogramming the TLBs if one or more of + * the two most used page sizes changes and we're using + * large pages in this process, except for Panther 32M/256M pages, + * which the Panther T16 does not support. + */ + if (sfmmup->sfmmu_flags & HAT_LGPG_FLAGS) { + /* Sort page sizes. */ + for (i = 0; i < mmu_page_sizes; i++) { + sortcnt[i] = ttecnt[i]; + } + for (j = 0; j < mmu_page_sizes; j++) { + for (i = mmu_page_sizes - 1, max = 0; i > 0; i--) { + if (sortcnt[i] > sortcnt[max]) + max = i; + } + tmp_pgsz[j] = max; + sortcnt[max] = 0; + } + + /* + * Handle Panther page dtlb calcs separately. The check + * for actual or potential 32M/256M pages must occur + * every time due to lack of T16 support for them. + * The sort works fine for Ch+/Jag, but Panther has + * pagesize restrictions for both DTLBs. + */ + oldval = sfmmup->sfmmu_pgsz[0] << 8 | sfmmup->sfmmu_pgsz[1]; + + if (panther_only) { + mmu_fixup_large_pages(sfmmup, ttecnt, tmp_pgsz); + } else { + /* Check 2 largest values after the sort. 
*/ + mmu_setup_page_sizes(sfmmup, ttecnt, tmp_pgsz); + } + newval = tmp_pgsz[0] << 8 | tmp_pgsz[1]; + if (newval != oldval) { + sfmmu_steal_context(sfmmup, tmp_pgsz); + } + } +} + +#endif /* CPU_IMP_DUAL_PAGESIZE */ + +struct heap_lp_page_size { + int impl; + uint_t tte; + int use_dt512; +}; + +struct heap_lp_page_size heap_lp_pgsz[] = { + + {CHEETAH_IMPL, TTE8K, 0}, /* default */ + {CHEETAH_IMPL, TTE64K, 0}, + {CHEETAH_IMPL, TTE4M, 0}, + + { CHEETAH_PLUS_IMPL, TTE4M, 1 }, /* default */ + { CHEETAH_PLUS_IMPL, TTE4M, 0 }, + { CHEETAH_PLUS_IMPL, TTE64K, 1 }, + { CHEETAH_PLUS_IMPL, TTE64K, 0 }, + { CHEETAH_PLUS_IMPL, TTE8K, 0 }, + + { JALAPENO_IMPL, TTE4M, 1 }, /* default */ + { JALAPENO_IMPL, TTE4M, 0 }, + { JALAPENO_IMPL, TTE64K, 1 }, + { JALAPENO_IMPL, TTE64K, 0 }, + { JALAPENO_IMPL, TTE8K, 0 }, + + { JAGUAR_IMPL, TTE4M, 1 }, /* default */ + { JAGUAR_IMPL, TTE4M, 0 }, + { JAGUAR_IMPL, TTE64K, 1 }, + { JAGUAR_IMPL, TTE64K, 0 }, + { JAGUAR_IMPL, TTE8K, 0 }, + + { SERRANO_IMPL, TTE4M, 1 }, /* default */ + { SERRANO_IMPL, TTE4M, 0 }, + { SERRANO_IMPL, TTE64K, 1 }, + { SERRANO_IMPL, TTE64K, 0 }, + { SERRANO_IMPL, TTE8K, 0 }, + + { PANTHER_IMPL, TTE4M, 1 }, /* default */ + { PANTHER_IMPL, TTE4M, 0 }, + { PANTHER_IMPL, TTE64K, 1 }, + { PANTHER_IMPL, TTE64K, 0 }, + { PANTHER_IMPL, TTE8K, 0 } +}; + +int heaplp_use_dt512 = -1; + +void +mmu_init_kernel_pgsz(struct hat *hat) +{ + uint_t tte = page_szc(segkmem_lpsize); + uchar_t new_cext_primary, new_cext_nucleus; + + if (heaplp_use_dt512 == 0 || tte > TTE4M) { + /* do not reprogram dt512 tlb */ + tte = TTE8K; + } + + new_cext_nucleus = TAGACCEXT_MKSZPAIR(tte, TTE8K); + new_cext_primary = TAGACCEXT_MKSZPAIR(TTE8K, tte); + + if (ctx_pgsz_array) + ctx_pgsz_array[KCONTEXT] = new_cext_primary; + hat->sfmmu_cext = new_cext_primary; + kcontextreg = ((uint64_t)new_cext_nucleus << CTXREG_NEXT_SHIFT) | + ((uint64_t)new_cext_primary << CTXREG_EXT_SHIFT); + mmu_init_kcontext(); +} + +size_t +mmu_get_kernel_lpsize(size_t lpsize) +{ + struct heap_lp_page_size *p_lpgsz, *pend_lpgsz; + int impl = cpunodes[getprocessorid()].implementation; + uint_t tte = TTE8K; + + pend_lpgsz = (struct heap_lp_page_size *) + ((char *)heap_lp_pgsz + sizeof (heap_lp_pgsz)); + + /* search for a valid segkmem_lpsize */ + for (p_lpgsz = heap_lp_pgsz; p_lpgsz < pend_lpgsz; p_lpgsz++) { + if (impl != p_lpgsz->impl) + continue; + + if (lpsize == 0) { + /* + * no setting for segkmem_lpsize in /etc/system + * use default from the table + */ + tte = p_lpgsz->tte; + heaplp_use_dt512 = p_lpgsz->use_dt512; + break; + } + + if (lpsize == TTEBYTES(p_lpgsz->tte) && + (heaplp_use_dt512 == -1 || + heaplp_use_dt512 == p_lpgsz->use_dt512)) { + + tte = p_lpgsz->tte; + heaplp_use_dt512 = p_lpgsz->use_dt512; + + /* found a match */ + break; + } + } + + if (p_lpgsz == pend_lpgsz) { + /* nothing found: disable large page kernel heap */ + tte = TTE8K; + heaplp_use_dt512 = 0; + } + + lpsize = TTEBYTES(tte); + + return (lpsize); +} diff --git a/usr/src/uts/sun4u/cpu/us3_jalapeno.c b/usr/src/uts/sun4u/cpu/us3_jalapeno.c new file mode 100644 index 0000000000..016604efcd --- /dev/null +++ b/usr/src/uts/sun4u/cpu/us3_jalapeno.c @@ -0,0 +1,904 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/ddi.h> +#include <sys/sysmacros.h> +#include <sys/archsystm.h> +#include <sys/vmsystm.h> +#include <sys/machparam.h> +#include <sys/machsystm.h> +#include <sys/machthread.h> +#include <sys/cpu.h> +#include <sys/cmp.h> +#include <sys/elf_SPARC.h> +#include <vm/hat_sfmmu.h> +#include <vm/seg_kmem.h> +#include <sys/cpuvar.h> +#include <sys/cheetahregs.h> +#include <sys/us3_module.h> +#include <sys/async.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/dditypes.h> +#include <sys/prom_debug.h> +#include <sys/prom_plat.h> +#include <sys/cpu_module.h> +#include <sys/sysmacros.h> +#include <sys/intreg.h> +#include <sys/clock.h> +#include <sys/platform_module.h> +#include <sys/machtrap.h> +#include <sys/ontrap.h> +#include <sys/panic.h> +#include <sys/memlist.h> +#include <sys/bootconf.h> +#include <sys/ivintr.h> +#include <sys/atomic.h> +#include <sys/fm/protocol.h> +#include <sys/fm/cpu/UltraSPARC-III.h> +#include <sys/errclassify.h> + +#ifdef CHEETAHPLUS_ERRATUM_25 +#include <sys/cyclic.h> +#endif /* CHEETAHPLUS_ERRATUM_25 */ + +/* cpu estar private data */ +typedef struct { + uint8_t state : 7; + uint8_t valid : 1; +} mcu_fsm_def_t; +mcu_fsm_def_t mcu_fsm_init_state[NCPU]; + +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) +/* + * jp_errata_85_enable can be set to 0 in /etc/system to disable + * JP Errata 85 workaround. + * + * jp_errata_85_allow_slow_scrub is usually set to !jp_errata_85_enable, + * but can be overridden in /etc/system. If set, it allows the scrubber + * to run in 1/2 or 1/32 mode. If a cpu is vulnerable to errata 85, + * this value should be zero. + * + * jp_errata_85_active is an internal variable and must not be + * set/changed via /etc/system or in any other way. + */ +extern int jp_errata_85_enable; /* for /etc/system use */ +extern int jp_errata_85_allow_slow_scrub; /* for /etc/system use */ + +int jp_errata_85_active = -1; /* warn: modified in code ONLY */ +uint64_t jp_estar_tl0_data[8]; +uint64_t jp_estar_tl1_data[8]; +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + +/* + * Setup trap handlers. 
+ */ +void +cpu_init_trap(void) +{ + CH_SET_TRAP(tt_pil15, ch_pil15_interrupt_instr); + + CH_SET_TRAP(tt0_fecc, fecc_err_instr); + CH_SET_TRAP(tt1_fecc, fecc_err_tl1_instr); + CH_SET_TRAP(tt1_swtrap0, fecc_err_tl1_cont_instr); + + CH_SET_TRAP(tt0_dperr, dcache_parity_instr); + CH_SET_TRAP(tt1_dperr, dcache_parity_tl1_instr); + CH_SET_TRAP(tt1_swtrap1, dcache_parity_tl1_cont_instr); + + CH_SET_TRAP(tt0_iperr, icache_parity_instr); + CH_SET_TRAP(tt1_iperr, icache_parity_tl1_instr); + CH_SET_TRAP(tt1_swtrap2, icache_parity_tl1_cont_instr); +} + + +static int +getintprop(dnode_t node, char *name, int deflt) +{ + int value; + + switch (prom_getproplen(node, name)) { + case sizeof (int): + (void) prom_getprop(node, name, (caddr_t)&value); + break; + + default: + value = deflt; + break; + } + + return (value); +} + +/* + * Set the magic constants of the implementation. + */ +/*ARGSUSED*/ +void +cpu_fiximp(dnode_t dnode) +{ + int i, a; + extern int vac_size, vac_shift; + extern uint_t vac_mask; + + static struct { + char *name; + int *var; + int defval; + } prop[] = { + "dcache-size", &dcache_size, CH_DCACHE_SIZE, + "dcache-line-size", &dcache_linesize, CH_DCACHE_LSIZE, + "icache-size", &icache_size, CH_ICACHE_SIZE, + "icache-line-size", &icache_linesize, CH_ICACHE_LSIZE, + "ecache-size", &ecache_size, JP_ECACHE_MAX_SIZE, + "ecache-line-size", &ecache_alignsize, JP_ECACHE_MAX_LSIZE, + "ecache-associativity", &ecache_associativity, JP_ECACHE_NWAY + }; + + for (i = 0; i < sizeof (prop) / sizeof (prop[0]); i++) + *prop[i].var = getintprop(dnode, prop[i].name, prop[i].defval); + + ecache_setsize = ecache_size / ecache_associativity; + + vac_size = CH_VAC_SIZE; + vac_mask = MMU_PAGEMASK & (vac_size - 1); + i = 0; a = vac_size; + while (a >>= 1) + ++i; + vac_shift = i; + shm_alignment = vac_size; + vac = 1; +} + +void +send_mondo_set(cpuset_t set) +{ + int lo, busy, nack, shipped = 0; + uint16_t i, cpuids[IDSR_BN_SETS]; + uint64_t idsr, nackmask = 0, busymask, curnack, curbusy; + uint64_t starttick, endtick, tick, lasttick; +#ifdef CHEETAHPLUS_ERRATUM_25 + int recovered = 0; + int cpuid; +#endif + + ASSERT(!CPUSET_ISNULL(set)); + starttick = lasttick = gettick(); + + /* + * Lower 2 bits of the agent ID determine which BUSY/NACK pair + * will be used for dispatching interrupt. For now, assume + * there are no more than IDSR_BN_SETS CPUs, hence no aliasing + * issues with respect to BUSY/NACK pair usage. + */ + for (i = 0; i < NCPU; i++) + if (CPU_IN_SET(set, i)) { + shipit(i, shipped /* ignored */); + nackmask |= IDSR_NACK_BIT(CPUID_TO_BN_PAIR(i)); + cpuids[CPUID_TO_BN_PAIR(i)] = i; + shipped++; + CPUSET_DEL(set, i); + if (CPUSET_ISNULL(set)) + break; + } + CPU_STATS_ADDQ(CPU, sys, xcalls, shipped); + + busymask = IDSR_NACK_TO_BUSY(nackmask); + busy = nack = 0; + endtick = starttick + xc_tick_limit; + for (;;) { + idsr = getidsr(); + if (idsr == 0) + break; + tick = gettick(); + /* + * If there is a big jump between the current tick + * count and lasttick, we have probably hit a break + * point. Adjust endtick accordingly to avoid panic. 
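+ *
+ * e.g. if a breakpoint held the CPU for D ticks, with
+ * D > xc_tick_jump_limit, then endtick += D leaves the remaining
+ * dispatch budget the same as it was before the stop, instead of
+ * letting the time spent at the breakpoint consume the whole
+ * xc_tick_limit window.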
+ */ + if (tick > (lasttick + xc_tick_jump_limit)) + endtick += (tick - lasttick); + lasttick = tick; + if (tick > endtick) { + if (panic_quiesce) + return; +#ifdef CHEETAHPLUS_ERRATUM_25 + cpuid = -1; + for (i = 0; i < IDSR_BN_SETS; i++) { + if (idsr & (IDSR_NACK_BIT(i) | + IDSR_BUSY_BIT(i))) { + cpuid = cpuids[i]; + break; + } + } + if (cheetah_sendmondo_recover && cpuid != -1 && + recovered == 0) { + if (mondo_recover(cpuid, i)) { + /* + * We claimed the whole memory or + * full scan is disabled. + */ + recovered++; + } + tick = gettick(); + endtick = tick + xc_tick_limit; + lasttick = tick; + /* + * Recheck idsr + */ + continue; + } else +#endif /* CHEETAHPLUS_ERRATUM_25 */ + { + cmn_err(CE_CONT, "send mondo timeout " + "[%d NACK %d BUSY]\nIDSR 0x%" + "" PRIx64 " cpuids:", nack, busy, idsr); + for (i = 0; i < IDSR_BN_SETS; i++) { + if (idsr & (IDSR_NACK_BIT(i) | + IDSR_BUSY_BIT(i))) { + cmn_err(CE_CONT, " 0x%x", + cpuids[i]); + } + } + cmn_err(CE_CONT, "\n"); + cmn_err(CE_PANIC, "send_mondo_set: timeout"); + } + } + curnack = idsr & nackmask; + curbusy = idsr & busymask; + if (curbusy) { + busy++; + continue; + } + +#ifdef SEND_MONDO_STATS + { + int n = gettick() - starttick; + if (n < 8192) + x_nack_stimes[n >> 7]++; + } +#endif + while (gettick() < (tick + sys_clock_mhz)) + ; + do { + lo = lowbit(curnack) - 1; + i = IDSR_NACK_IDX(lo); + shipit(cpuids[i], i); + curnack &= ~(1ull << lo); + } while (curnack); + nack++; + busy = 0; + } +#ifdef SEND_MONDO_STATS + { + int n = gettick() - starttick; + if (n < 8192) + x_set_stimes[n >> 7]++; + else + x_set_ltimes[(n >> 13) & 0xf]++; + } + x_set_cpus[shipped]++; +#endif +} + +/* + * Handles error logging for implementation specific error types + */ +int +cpu_impl_async_log_err(void *flt, errorq_elem_t *eqep) +{ + ch_async_flt_t *ch_flt = (ch_async_flt_t *)flt; + struct async_flt *aflt = (struct async_flt *)flt; + page_t *pp; + + switch (ch_flt->flt_type) { + + case CPU_IC_PARITY: + cpu_async_log_ic_parity_err(flt); + return (CH_ASYNC_LOG_DONE); + + case CPU_DC_PARITY: + cpu_async_log_dc_parity_err(flt); + return (CH_ASYNC_LOG_DONE); + + case CPU_RCE: + pp = page_numtopp_nolock((pfn_t) + (aflt->flt_addr >> MMU_PAGESHIFT)); + if (pp) { + if (page_isretired(pp) || page_deteriorating(pp)) { + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_PAGEDET); + } else if (ce_scrub_xdiag_recirc(aflt, ce_queue, eqep, + offsetof(ch_async_flt_t, cmn_asyncflt))) { + return (CH_ASYNC_LOG_RECIRC); + } + } else { + CE_XDIAG_SETSKIPCODE(aflt->flt_disp, + CE_XDIAG_SKIP_NOPP); + } + /*FALLTHRU*/ + /* + * cases where we just want to report the error and continue. + */ + case CPU_BPAR: + case CPU_UMS: + case CPU_FRC: + case CPU_FRU: + cpu_log_err(aflt); + return (CH_ASYNC_LOG_DONE); + + /* + * Cases where we want to fall through to handle panicking. + */ + case CPU_RUE: + cpu_log_err(aflt); + return (CH_ASYNC_LOG_CONTINUE); + + default: + return (CH_ASYNC_LOG_UNKNOWN); + } +} + +/* + * Figure out if Ecache is direct-mapped (Cheetah or Cheetah+ with Ecache + * control ECCR_ASSOC bit off or 2-way (Cheetah+ with ECCR_ASSOC on). + * We need to do this on the fly because we may have mixed Cheetah+'s with + * both direct and 2-way Ecaches. + */ +int +cpu_ecache_nway(void) +{ + return (JP_ECACHE_NWAY); +} + +/* + * Note that these are entered into the table in the order: + * Fatal Errors first, orphaned UCU/UCC, AFAR Overwrite policy, + * FRC/FRU, and finally IVPE. 
+ * + * Afar overwrite policy is: + * Jalapeno: + * UCU,UCC > RUE,UE,EDU,WDU,CPU,WBP,BP > RCE,CE,EDC,WDC,CPC > + * TO,BERR > UMS,OM + * Serrano: + * UCU,UCC > RUE,UE,EDU,WDU,CPU,WBP,BP > RCE,CE,EDC,WDC,CPC,ETI,ETC > + * TO,BERR > UMS,OM + */ +ecc_type_to_info_t ecc_type_to_info[] = { + + /* Fatal Errors */ + C_AFSR_JETO, "JETO ", ECC_ALL_TRAPS, CPU_FATAL, + "JETO Fatal", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_JETO, + C_AFSR_SCE, "SCE ", ECC_ALL_TRAPS, CPU_FATAL, + "SCE Fatal", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_SCE, + C_AFSR_JEIC, "JEIC ", ECC_ALL_TRAPS, CPU_FATAL, + "JEIC Fatal", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_JEIC, + C_AFSR_JEIT, "JEIT ", ECC_ALL_TRAPS, CPU_FATAL, + "JEIT Fatal", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_JEIT, + C_AFSR_JEIS, "JEIS ", ECC_ALL_TRAPS, CPU_FATAL, + "JEIS Fatal", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_JEIS, +#if defined(JALAPENO) + C_AFSR_ETP, "ETP ", ECC_ALL_TRAPS, CPU_FATAL, + "ETP Fatal", + FM_EREPORT_PAYLOAD_L2_TAG_PE, + FM_EREPORT_CPU_USIII_ETP, +#elif defined(SERRANO) + C_AFSR_ETS, "ETS ", ECC_ASYNC_TRAPS, CPU_FATAL, + "ETS Fatal", + FM_EREPORT_PAYLOAD_L2_TAG_ECC, + FM_EREPORT_CPU_USIII_ETS, + C_AFSR_ETU, "ETU ", ECC_ASYNC_TRAPS, CPU_FATAL, + "ETU Fatal", + FM_EREPORT_PAYLOAD_L2_TAG_ECC, + FM_EREPORT_CPU_USIII_ETU, +#endif /* SERRANO */ + C_AFSR_IERR, "IERR ", ECC_ALL_TRAPS, CPU_FATAL, + "IERR Fatal", + FM_EREPORT_PAYLOAD_SYSTEM2, + FM_EREPORT_CPU_USIII_IERR, + C_AFSR_ISAP, "ISAP ", ECC_ALL_TRAPS, CPU_FATAL, + "ISAP Fatal", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_ISAP, + + /* Orphaned UCU/UCC Errors */ + C_AFSR_UCU, "OUCU ", ECC_ORPH_TRAPS, CPU_ORPH, + "Orphaned UCU", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_UCU, + C_AFSR_UCC, "OUCC ", ECC_ORPH_TRAPS, CPU_ORPH, + "Orphaned UCC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_UCC, + + /* UCU, UCC */ + C_AFSR_UCU, "UCU ", ECC_F_TRAP, CPU_UE_ECACHE, + "UCU", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_UCU, + C_AFSR_UCC, "UCC ", ECC_F_TRAP, CPU_CE_ECACHE, + "UCC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_UCC, + + + /* RUE, UE, EDU:ST, EDU:BLD, WDU, CPU, BP, WBP */ + C_AFSR_RUE, "RUE ", ECC_ASYNC_TRAPS, CPU_RUE, + "Uncorrectable remote memory/cache (RUE)", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_RUE, + C_AFSR_UE, "UE ", ECC_ASYNC_TRAPS, CPU_UE, + "Uncorrectable memory (UE)", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_UE, + C_AFSR_EDU, "EDU ", ECC_C_TRAP, CPU_UE_ECACHE_RETIRE, + "EDU:ST", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_EDUST, + C_AFSR_EDU, "EDU ", ECC_D_TRAP, CPU_UE_ECACHE_RETIRE, + "EDU:BLD", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_EDUBL, + C_AFSR_WDU, "WDU ", ECC_C_TRAP, CPU_UE_ECACHE_RETIRE, + "WDU", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_WDU, + C_AFSR_CPU, "CPU ", ECC_C_TRAP, CPU_UE_ECACHE, + "CPU", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_CPU, + C_AFSR_WBP, "WBP ", ECC_C_TRAP, CPU_BPAR, + "JBUS parity error on writeback or block store (WBP)", + FM_EREPORT_PAYLOAD_SYSTEM3, + FM_EREPORT_CPU_USIII_WBP, + C_AFSR_BP, "BP ", ECC_ASYNC_TRAPS, CPU_BPAR, + "JBUS parity error on returned read data (BP)", + FM_EREPORT_PAYLOAD_SYSTEM3, + FM_EREPORT_CPU_USIII_BP, + + /* RCE, CE, EDC, WDC, CPC */ + C_AFSR_RCE, "RCE ", ECC_C_TRAP, CPU_RCE, + "Corrected remote memory/cache (RCE)", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_RCE, + C_AFSR_CE, "CE ", ECC_C_TRAP, CPU_CE, + "Corrected memory (CE)", + 
FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_CE, + C_AFSR_EDC, "EDC ", ECC_C_TRAP, CPU_CE_ECACHE, + "EDC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_EDC, + C_AFSR_WDC, "WDC ", ECC_C_TRAP, CPU_CE_ECACHE, + "WDC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_WDC, + C_AFSR_CPC, "CPC ", ECC_C_TRAP, CPU_CE_ECACHE, + "CPC", + FM_EREPORT_PAYLOAD_L2_DATA, + FM_EREPORT_CPU_USIII_CPC, +#if defined(SERRANO) + /* ETI, ETC */ + C_AFSR_ETI, "ETI", ECC_F_TRAP | ECC_C_TRAP, CPU_CE_ECACHE, + "ETI", + FM_EREPORT_PAYLOAD_L2_TAG_ECC, + FM_EREPORT_CPU_USIII_ETI, + C_AFSR_ETC, "ETC", ECC_F_TRAP | ECC_C_TRAP, CPU_CE_ECACHE, + "ETC", + FM_EREPORT_PAYLOAD_L2_TAG_ECC, + FM_EREPORT_CPU_USIII_ETC, +#endif /* SERRANO */ + + /* TO, BERR */ + C_AFSR_TO, "TO ", ECC_ASYNC_TRAPS, CPU_TO, + "Timeout (TO)", + FM_EREPORT_PAYLOAD_IO, + FM_EREPORT_CPU_USIII_TO, + C_AFSR_BERR, "BERR ", ECC_ASYNC_TRAPS, CPU_BERR, + "Bus Error (BERR)", + FM_EREPORT_PAYLOAD_IO, + FM_EREPORT_CPU_USIII_BERR, + + /* UMS, OM */ + C_AFSR_UMS, "UMS ", ECC_C_TRAP, CPU_UMS, + "Unsupported store (UMS)", + FM_EREPORT_PAYLOAD_IO, + FM_EREPORT_CPU_USIII_UMS, + C_AFSR_OM, "OM ", ECC_ASYNC_TRAPS, CPU_BERR, + "Out of range memory (OM)", + FM_EREPORT_PAYLOAD_IO, + FM_EREPORT_CPU_USIII_OM, + + /* FRC, FRU */ + C_AFSR_FRC, "FRC ", ECC_C_TRAP, CPU_FRC, + "Corrected memory (FRC)", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_FRC, + C_AFSR_FRU, "FRU ", ECC_C_TRAP, CPU_FRU, + "Uncorrectable memory (FRU)", + FM_EREPORT_PAYLOAD_MEMORY, + FM_EREPORT_CPU_USIII_FRU, + + /* IVPE */ + C_AFSR_IVPE, "IVPE ", ECC_C_TRAP, CPU_IV, + "IVPE", + FM_EREPORT_PAYLOAD_SYSTEM1, + FM_EREPORT_CPU_USIII_IVPE, + + 0, NULL, 0, 0, + NULL, + FM_EREPORT_PAYLOAD_UNKNOWN, + FM_EREPORT_CPU_USIII_UNKNOWN, +}; + +/* + * J_REQ overwrite policy (see UltraSPARC-IIIi PRM) + * + * Class 4: RUE, BP, WBP + * Class 3: RCE + * Class 2: TO, BERR + * Class 1: UMS + */ +uint64_t jreq_overwrite[] = { + C_AFSR_RUE | C_AFSR_BP | C_AFSR_WBP, + C_AFSR_RCE, + C_AFSR_TO | C_AFSR_BERR, + C_AFSR_UMS, + 0 +}; + +/* + * AGENT ID overwrite policy (see UltraSPARC-IIIi PRM) + * + * Class 2: CPU, FRU + * Class 1: CPC, FRC + */ +uint64_t jbus_aid_overwrite[] = { + C_AFSR_CPU | C_AFSR_FRU, + C_AFSR_CPC | C_AFSR_FRC, + 0 +}; + +int +afsr_to_jaid_status(uint64_t afsr, uint64_t afsr_bit) +{ + return (afsr_to_overw_status(afsr, afsr_bit, jbus_aid_overwrite)); +} + +/* + * See UltraSPARC-IIIi+ PRM + * Class 5: ETS, ETU, EFES + * Class 4: UCC, UCU + * Class 3: UE, RUE, BP, WBP, EDU, WDU, CPU + * Class 2: CE, RCE, EDC, WDC, CPC, ETI, ETC + * Class 1: TO, BERR + * Class 0: UMS, OM + * + * See UltraSPARC-IIIi PRM + * Class 5: ETP + * Class 4: UCC, UCU + * Class 3: UE, RUE, BP, WBP, EDU, WDU + * Class 2: CE, RCE, EDC, WDC + * Class 1: TO, BERR + * Class 0: UMS, OM + */ +uint64_t afar_overwrite[] = { +#if defined(JALAPENO) + C_AFSR_ETP, +#elif defined(SERRANO) + C_AFSR_ETS | C_AFSR_ETU | C_AFSR_EFES, +#endif /* SERRANO */ + C_AFSR_UCC | C_AFSR_UCU, + C_AFSR_UE | C_AFSR_RUE | C_AFSR_BP | C_AFSR_WBP | C_AFSR_EDU | + C_AFSR_WDU | C_AFSR_CPU, +#if defined(SERRANO) + C_AFSR_ETI | C_AFSR_ETC | +#endif /* SERRANO */ + C_AFSR_CE | C_AFSR_RCE | C_AFSR_EDC | C_AFSR_WDC | C_AFSR_CPC, + C_AFSR_TO | C_AFSR_BERR, + C_AFSR_UMS | C_AFSR_OM, + 0 +}; + +#if defined(SERRANO) +/* + * Serrano has a second AFAR that captures the physical address on + * FRC/FRU errors (which Jalapeno does not). This register also + * captures the address for UE and CE errors. 
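+ *
+ * Like the other priority tables here, this one is meant to be fed to
+ * afsr_to_overw_status(); a hypothetical caller (sketch only) would do
+ *
+ *	if (afsr_to_overw_status(afsr, C_AFSR_CE, afar2_overwrite))
+ *		use AFAR2 as the CE address;
+ *
+ * i.e. trust AFAR2 for a CE only when no higher-class bit (UE, FRU)
+ * is also set in the AFSR.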
+ * + * See UltraSPARC-IIIi+ PRM + * Class 3: UE + * Class 2: FRU + * Class 1: CE + * Class 0: FRC + */ +uint64_t afar2_overwrite[] = { + C_AFSR_UE, + C_AFSR_FRU, + C_AFSR_CE, + C_AFSR_FRC, + 0 +}; +#endif /* SERRANO */ + +/* + * See UltraSPARC-IIIi PRM + * Class 2: UE, FRU, EDU, WDU, UCU, CPU + * Class 1: CE, FRC, EDC, WDC, UCC, CPC + */ +uint64_t esynd_overwrite[] = { +#if defined(SERRANO) + C_AFSR_ETS | C_AFSR_ETU | +#endif /* SERRANO */ + C_AFSR_UE | C_AFSR_FRU | C_AFSR_EDU | C_AFSR_WDU | C_AFSR_UCU | + C_AFSR_CPU, + C_AFSR_CE | C_AFSR_FRC | C_AFSR_EDC | C_AFSR_WDC | C_AFSR_UCC | + C_AFSR_CPC, + 0 +}; + +/* + * Prioritized list of Error bits for BSYND (referred to as + * MSYND to share code with CHEETAH & CHEETAH_PLUS) overwrite. + * See UltraSPARC-IIIi PRM + * Class 3: ISAP + * Class 2: BP + * Class 1: WBP, IVPE + */ +uint64_t msynd_overwrite[] = { + C_AFSR_ISAP, + C_AFSR_BP, + C_AFSR_WBP | C_AFSR_IVPE, + 0 +}; + +/* + * change cpu speed bits -- new speed will be normal-speed/divisor. + * + * The Jalapeno memory controllers are required to drain outstanding + * memory transactions within 32 JBus clocks in order to be ready + * to enter Estar mode. In some corner cases however, that time + * fell short. + * + * A safe software solution is to force MCU to act like in Estar mode, + * then delay 1us (in ppm code) prior to assert J_CHNG_L signal. + * To reverse the effect, upon exiting Estar, software restores the + * MCU to its original state. + */ +/* ARGSUSED1 */ +void +cpu_change_speed(uint64_t divisor, uint64_t arg2) +{ + bus_config_eclk_t *bceclk; + uint64_t reg; + uint64_t oldreg; + uint64_t mreg; + uint64_t val64; + int id = (CPU)->cpu_id; + +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + /* + * ASI Ecache flush in 1/2 or 1/32 speed mode can result + * in CPU fatal reset (JETO or IERR/TO on MP). A workaround + * is to force the CPU to full speed mode prior to using + * ASI Ecache flush opeartion to flush E$. Since we can't + * always use cross calls at the time of flushing E$, we + * cannot change other CPU speed. Hence, this workaround + * is applicable to uniprocessor configuration only and + * can't be used in multiprocessor configuration. + * + * Note that this workaround is activated only when the CPU + * has been fully initialized and its speed is lowered by the + * ppm for the first time. It can be disabled via /etc/system + * by setting jp_errata_85_enable to 0 and rebooting the + * system. + */ + if ((jp_errata_85_active == -1) && + jp_errata_85_enable && + (divisor != JBUS_CONFIG_ECLK_1_DIV)) { + if (ncpus == 1) + jp_errata_85_active = 1; + else + jp_errata_85_active = 0; + } + if ((!jp_errata_85_allow_slow_scrub) && (CPU_PRIVATE(CPU) != NULL)) { + int i; + ch_scrub_misc_t *chpr_scrubp = + CPU_PRIVATE_PTR(CPU, chpr_scrub_misc); + + /* We're only allowed to run the scrubbers at full speed */ + + for (i = 0; i < CACHE_SCRUBBER_COUNT; i++) { + chpr_scrubp->chsm_enable[i] = + (divisor == JBUS_CONFIG_ECLK_1_DIV); + } + } +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + + /* + * We're only interested in mcu_ctl_reg1 bit 26 and 25, of which + * the value will be stored in the lower half of a byte. The + * top bit of this byte is designated as a valid bit - 0 means + * invalid, 1 means valid. 
+ */ + if (!mcu_fsm_init_state[id].valid) { + val64 = get_mcu_ctl_reg1() & JP_MCU_FSM_MASK; + mcu_fsm_init_state[id].state = val64 >> JP_MCU_FSM_SHIFT; + mcu_fsm_init_state[id].valid = 1; + } + + for (bceclk = bus_config_eclk; bceclk->divisor; bceclk++) { + if (bceclk->divisor != divisor) + continue; + reg = get_jbus_config(); + oldreg = reg; + reg &= ~JBUS_CONFIG_ECLK_MASK; + reg |= bceclk->mask; + set_jbus_config(reg); + (void) get_jbus_config(); + + /* + * MCU workaround, refer to Jalapeno spec, EnergyStar section + * for detail. + */ + + /* Upon entering engery star mode, turn off extra MCU FSMs */ + if (((oldreg & JBUS_CONFIG_ECLK_MASK) == JBUS_CONFIG_ECLK_1) && + ((divisor == JBUS_CONFIG_ECLK_2_DIV) || + (divisor == JBUS_CONFIG_ECLK_32_DIV))) { + mreg = get_mcu_ctl_reg1(); + if ((mreg & JP_MCU_FSM_MASK) != 0) { + mreg &= ~JP_MCU_FSM_MASK; + set_mcu_ctl_reg1(mreg); + (void) get_mcu_ctl_reg1(); + } + /* Upon exiting energy star mode, restore extra MCU FSMs */ + } else if (divisor == JBUS_CONFIG_ECLK_1_DIV) { + mreg = get_mcu_ctl_reg1(); + val64 = mcu_fsm_init_state[id].state; + mreg |= val64 << JP_MCU_FSM_SHIFT; + set_mcu_ctl_reg1(mreg); + (void) get_mcu_ctl_reg1(); + } + CPU->cpu_m.divisor = (uchar_t)divisor; + return; + } + /* + * We will reach here only if OBP and kernel don't agree on + * the speeds supported by the CPU. + */ + cmn_err(CE_WARN, "cpu_change_speed: bad divisor %" PRIu64, divisor); +} + +/* + * Cpu private initialization. This includes allocating the cpu_private + * data structure, initializing it, and initializing the scrubber for this + * cpu. This function calls cpu_init_ecache_scrub_dr to init the scrubber. + * We use kmem_cache_create for the cheetah private data structure because + * it needs to be allocated on a PAGESIZE (8192) byte boundary. + */ +void +cpu_init_private(struct cpu *cp) +{ + cheetah_private_t *chprp; + int i; + + ASSERT(CPU_PRIVATE(cp) == NULL); + + /* LINTED: E_TRUE_LOGICAL_EXPR */ + ASSERT((offsetof(cheetah_private_t, chpr_tl1_err_data) + + sizeof (ch_err_tl1_data_t) * CH_ERR_TL1_TLMAX) <= PAGESIZE); + +#if defined(SERRANO) + if (!IS_SERRANO(cpunodes[cp->cpu_id].implementation)) { + cmn_err(CE_PANIC, "CPU%d: implementation 0x%x not supported" + " on UltraSPARC-IIIi+ code\n", cp->cpu_id, + cpunodes[cp->cpu_id].implementation); + } +#else /* SERRANO */ + if (!IS_JALAPENO(cpunodes[cp->cpu_id].implementation)) { + cmn_err(CE_PANIC, "CPU%d: implementation 0x%x not supported" + " on UltraSPARC-IIIi code\n", cp->cpu_id, + cpunodes[cp->cpu_id].implementation); + } +#endif /* SERRANO */ + + /* + * If the ch_private_cache has not been created, create it. 
+ */ + if (ch_private_cache == NULL) { + ch_private_cache = kmem_cache_create("ch_private_cache", + sizeof (cheetah_private_t), PAGESIZE, NULL, NULL, + NULL, NULL, static_arena, 0); + } + + chprp = CPU_PRIVATE(cp) = kmem_cache_alloc(ch_private_cache, KM_SLEEP); + + bzero(chprp, sizeof (cheetah_private_t)); + chprp->chpr_fecctl0_logout.clo_data.chd_afar = LOGOUT_INVALID; + chprp->chpr_cecc_logout.clo_data.chd_afar = LOGOUT_INVALID; + chprp->chpr_async_logout.clo_data.chd_afar = LOGOUT_INVALID; + for (i = 0; i < CH_ERR_TL1_TLMAX; i++) + chprp->chpr_tl1_err_data[i].ch_err_tl1_logout.clo_data.chd_afar + = LOGOUT_INVALID; + + chprp->chpr_icache_size = CH_ICACHE_SIZE; + chprp->chpr_icache_linesize = CH_ICACHE_LSIZE; + + cpu_init_ecache_scrub_dr(cp); + + chprp->chpr_ec_set_size = cpunodes[cp->cpu_id].ecache_size / + cpu_ecache_nway(); + + adjust_hw_copy_limits(cpunodes[cp->cpu_id].ecache_size); + ch_err_tl1_paddrs[cp->cpu_id] = va_to_pa(chprp); + ASSERT(ch_err_tl1_paddrs[cp->cpu_id] != -1); +} + +/* + * Clear the error state registers for this CPU. + * For Jalapeno, just clear the AFSR + */ +void +set_cpu_error_state(ch_cpu_errors_t *cpu_error_regs) +{ + set_asyncflt(cpu_error_regs->afsr & ~C_AFSR_FATAL_ERRS); +} + +/* + * Update cpu_offline_set so the scrubber knows which cpus are offline + */ +/*ARGSUSED*/ +int +cpu_scrub_cpu_setup(cpu_setup_t what, int cpuid, void *arg) +{ + switch (what) { + case CPU_ON: + case CPU_INIT: + CPUSET_DEL(cpu_offline_set, cpuid); + break; + case CPU_OFF: + CPUSET_ADD(cpu_offline_set, cpuid); + break; + default: + break; + } + return (0); +} diff --git a/usr/src/uts/sun4u/cpu/us3_jalapeno_asm.s b/usr/src/uts/sun4u/cpu/us3_jalapeno_asm.s new file mode 100644 index 0000000000..1c88b57f7e --- /dev/null +++ b/usr/src/uts/sun4u/cpu/us3_jalapeno_asm.s @@ -0,0 +1,1039 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ * + * Assembly code support for the jalapeno module + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#if !defined(lint) +#include "assym.h" +#endif /* lint */ + +#include <sys/asm_linkage.h> +#include <sys/mmu.h> +#include <vm/hat_sfmmu.h> +#include <sys/machparam.h> +#include <sys/machcpuvar.h> +#include <sys/machthread.h> +#include <sys/machtrap.h> +#include <sys/privregs.h> +#include <sys/asm_linkage.h> +#include <sys/trap.h> +#include <sys/cheetahregs.h> +#include <sys/us3_module.h> +#include <sys/xc_impl.h> +#include <sys/intreg.h> +#include <sys/async.h> +#include <sys/clock.h> +#include <sys/cheetahasm.h> + +#ifdef TRAPTRACE +#include <sys/traptrace.h> +#endif /* TRAPTRACE */ + +#if !defined(lint) + +/* BEGIN CSTYLED */ + +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + +#define CHK_JP_ERRATA85_ENABLED(scr, label) \ + ASM_LD(scr, jp_errata_85_active); \ + cmp scr, 1; \ + bne %icc, label; \ + nop + +#define SET_64BIT_PA(dest, scr, hi32, lo32) \ + set hi32, scr; \ + sllx scr, 32, scr; \ + sethi %hi(lo32), dest; \ + or dest, %lo(lo32), dest; \ + or scr, dest, dest + +/* + * Macro to trigger Jalapeno/Tomatillo speed change + * j_chng_pa - scratch register + * scr - scratch register + */ +#define JP_ESTAR_TRIGGER(j_chng_pa, scr) \ + SET_64BIT_PA(j_chng_pa, scr, TOM_HIGH_PA, M_T_J_CHNG_INIT_PA); \ + ldxa [j_chng_pa]ASI_IO, scr; \ +5: \ + and scr, TOM_TRIGGER_MASK, scr; \ + cmp scr, TOM_TRIGGER; \ + be,pt %icc, 5b; /* wait while 10 */ \ + ldxa [j_chng_pa]ASI_IO, scr; \ + andn scr, TOM_TRIGGER_MASK, scr; \ + stxa scr, [j_chng_pa]ASI_IO; /* clear j_chng[1:0] */ \ + or scr, TOM_TRIGGER, scr; \ + stxa scr, [j_chng_pa]ASI_IO; /* trigger j_chng */ \ + ldxa [j_chng_pa]ASI_IO, scr; \ +6: \ + and scr, TOM_TRIGGER_MASK, scr; \ + cmp scr, TOM_TRIGGER; \ + be,pt %icc, 6b; /* wait while 10 */ \ + ldxa [j_chng_pa]ASI_IO, scr; \ + andn scr, TOM_TRIGGER_MASK, scr; \ + stxa scr, [j_chng_pa]ASI_IO; /* deassert j_chng */ + +/* + * Macro to set Jalapeno CPU speed + * speed - new speed constant + * scr1 - scratch register + * scr2 - scratch register + */ +#define SET_JP_SPEED(speed, scr1, scr2) \ + ldxa [%g0]ASI_JBUS_CONFIG, scr1; \ + set JBUS_CONFIG_ECLK_MASK, scr2; \ + andn scr1, scr2, scr1; \ + set speed, scr2; \ + or scr1, scr2, scr1; \ + stxa scr1, [%g0]ASI_JBUS_CONFIG; + +/* + * macro to set Master Tomatillo speed + * speed - tomatillo speed constant + * tpa - tomatillo estar control register PA + * scr - scratch register + */ +#define SET_TOM_SPEED(speed, tpa, scr) \ + ldxa [tpa]ASI_IO, scr; \ + andn scr, TOM_ESTAR_ELCK_MASK, scr; \ + or scr, speed, scr; \ + stxa scr, [tpa]ASI_IO; + +/* + * macro to check and set Slave Tomatillo speed + * speed - tomatillo speed constant + * scr1 - scratch register + * scr2 - scratch register + */ + +#define SET_SLAVE_T_SPEED(speed, scr1, scr2) \ + ldxa [%g0]ASI_JBUS_CONFIG, scr2; \ + srlx scr2, JBUS_SLAVE_T_PORT_BIT, scr2; \ + btst 1, scr2; \ + bz,pt %icc, 4f; \ + nop; \ + SET_64BIT_PA(scr1, scr2, TOM_HIGH_PA, S_T_ESTAR_CTRL_PA); \ + SET_TOM_SPEED(speed, scr1, scr2); \ +4: + + +/* + * macro to adjust ASI_MCU_CTL_REG1[26:25] fsm bits according to + * new cpu speed: fsm[1:0]=11b for full speed, fsm[1:0]=0 for estar speed + * value - fsm bit value constant + * scr1 - scratch register + * scr2 - scratch register + */ +#define JP_ADJUST_FSM(value, scr1, scr2) \ + ldxa [%g0]ASI_MCU_CTRL, scr1; \ + set JP_MCU_FSM_MASK, scr2; \ + andn scr1, scr2, scr1; \ + set value, scr2; \ + or scr1, scr2, scr1; \ + stxa scr1, [%g0]ASI_MCU_CTRL; \ + membar #Sync; + +/* + * 
JP_FORCE_FULL_SPEED and its fellow macros are for Jalapeno + * workstation to work around Errata 85. The front portion of + * it packs JP speed(14..13) and Tomatillo speed(5..0) into one + * register. + * + * Current code assumes that these two fields are non-overlapping. + * If that assumption changes, then this code won't work. If so, we + * force a compile time error by not defining the JP_FORCE_FULL_SPEED + * and JP_RESTORE_SPEED macros below. + */ + +#if !(JBUS_CONFIG_ECLK_MASK & TOM_SPEED_MASK) + +/* + * Macro to force Jalapeno/Tomatillo to full speed + * old_lvl - register used to save original cpu, tomatillo speed + * scr2 - scratch register + * scr3 - scratch register + * scr4 - scratch register + */ +#define JP_FORCE_FULL_SPEED(old_lvl, scr2, scr3, scr4) \ + ldxa [%g0]ASI_JBUS_CONFIG, old_lvl; \ + set JBUS_CONFIG_ECLK_MASK, scr4; \ + and old_lvl, scr4, old_lvl; \ + SET_64BIT_PA(scr2, scr3, TOM_HIGH_PA, M_T_ESTAR_CTRL_PA); \ + ldxa [scr2]ASI_IO, scr3; \ + set TOM_ESTAR_ELCK_MASK, scr4; \ + and scr3, scr4, scr3; \ + or old_lvl, scr3, old_lvl; \ + /* original jp and tomatillo speed saved in old_lvl */ \ + \ + /* either intended or currently at full speed */ \ + set JBUS_CONFIG_ECLK_MASK, scr4; \ + andcc old_lvl, scr4, %g0; \ + bz,pt %icc, 8f; \ + nop; \ + /* go through 1/2 speed. */ \ + SET_JP_SPEED(JBUS_CONFIG_ECLK_2, scr3, scr4); \ + SET_TOM_SPEED(TOM_HALF_SPEED, scr2, scr3); \ + SET_SLAVE_T_SPEED(TOM_HALF_SPEED, scr3, scr4); \ + JP_ADJUST_FSM(0, scr3, scr4); \ + set jp_estar_tl0_data, scr3; \ + ldx [scr3], %g0; \ + membar #Sync; /* or busy wait 1us */ \ + JP_ESTAR_TRIGGER(scr3, scr4); \ +8: \ + /* bring to 1:1 speed */ \ + SET_JP_SPEED(JBUS_CONFIG_ECLK_1, scr3, scr4); \ + SET_TOM_SPEED(TOM_FULL_SPEED, scr2, scr3); \ + SET_SLAVE_T_SPEED(TOM_FULL_SPEED, scr3, scr4); \ + JP_ADJUST_FSM(JP_MCU_FSM_MASK, scr3, scr4); \ + JP_ESTAR_TRIGGER(scr3, scr4) + + +/* + * Macro to restore Jalapeno/Tomatillo to original speed + * old_lvl - register contains saved original cpu, tomatillo speed + * scr2 - scratch register + * scr3 - scratch register + * scr4 - scratch register + * + * If trap had occured in the middle of ppm cpu speed transtion, then + * old_lvl[31:10] contains the intended new speed written into jbus_config. + * if old_lvl[9:0] is inconsistent with old_lvl[31:10], then the trap surely + * interrupted the ppm cpu speed transition, otherwise nothing for sure. + * We'll restore the intended/then-current speed, that should cause no + * trouble to subsequent ppm cpu speed change code. 
+ */ +#define JP_RESTORE_SPEED(old_lvl, scr2, scr3, scr4) \ + srlx old_lvl, JBUS_CONFIG_ECLK_SHIFT, scr2; \ + and scr2, 3, scr2; \ + add scr2, 1, scr2; \ + cmp scr2, 3; \ + bne,pt %icc, 7f; \ + nop; \ + set TOM_SLOW_SPEED, scr2; \ + /* scr2 contains tom speed according to intended jp speed */ \ +7: \ + andn old_lvl, TOM_ESTAR_ELCK_MASK, old_lvl; \ + or scr2, old_lvl, old_lvl; \ + /* updated old_lvl to contain intended jp and tom speed */ \ + andcc old_lvl, TOM_FULL_SPEED, %g0; \ + bnz,pt %icc, 9f; /* intended full, already at full */ \ + nop; \ + \ + /* go to half speed */ \ + SET_JP_SPEED(JBUS_CONFIG_ECLK_2, scr3, scr4); \ + SET_64BIT_PA(scr2, scr3, TOM_HIGH_PA, M_T_ESTAR_CTRL_PA); \ + SET_TOM_SPEED(TOM_HALF_SPEED, scr2, scr3); \ + SET_SLAVE_T_SPEED(TOM_HALF_SPEED, scr3, scr4); \ + JP_ADJUST_FSM(0, scr3, scr4); \ + set jp_estar_tl0_data, scr3; \ + ldx [scr3], %g0; \ + membar #Sync; \ + JP_ESTAR_TRIGGER(scr3, scr4); \ + andcc old_lvl, TOM_SLOW_SPEED, %g0; \ + bz,pt %icc, 9f; /* intended 1:2, already at 1:2 */ \ + nop; \ + \ + /* go to 1:32 speed */ \ + SET_JP_SPEED(JBUS_CONFIG_ECLK_32, scr3, scr4); \ + SET_TOM_SPEED(TOM_SLOW_SPEED, scr2, scr3); \ + SET_SLAVE_T_SPEED(TOM_SLOW_SPEED, scr3, scr4); \ + JP_ESTAR_TRIGGER(scr3, scr4); \ +9: + +#endif /* !(JBUS_CONFIG_ECLK_MASK & TOM_SPEED_MASK) */ +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + +/* + * Jalapeno version to reflush an Ecache line by index. + * Will flush all 4 ways (with only one scratch register). + * Note that the code will be faster if we use 2 scratch registers. + */ +#define ECACHE_REFLUSH_LINE(ec_set_size, index, scr1) \ + JP_EC_DIAG_ACCESS_MEMBAR; \ + ldxa [index]ASI_EC_DIAG, %g0; \ + JP_EC_DIAG_ACCESS_MEMBAR; \ + mov 1, scr1; \ + sllx scr1, JP_ECFLUSH_EC_WAY_SHIFT, scr1; \ + add scr1, index, scr1; \ + JP_EC_DIAG_ACCESS_MEMBAR; \ + ldxa [scr1]ASI_EC_DIAG, %g0; \ + JP_EC_DIAG_ACCESS_MEMBAR; \ + mov 2, scr1; \ + sllx scr1, JP_ECFLUSH_EC_WAY_SHIFT, scr1; \ + add scr1, index, scr1; \ + JP_EC_DIAG_ACCESS_MEMBAR; \ + ldxa [scr1]ASI_EC_DIAG, %g0; \ + JP_EC_DIAG_ACCESS_MEMBAR; \ + mov 3, scr1; \ + sllx scr1, JP_ECFLUSH_EC_WAY_SHIFT, scr1; \ + add scr1, index, scr1; \ + JP_EC_DIAG_ACCESS_MEMBAR; \ + ldxa [scr1]ASI_EC_DIAG, %g0; \ + JP_EC_DIAG_ACCESS_MEMBAR + +/* + * Jalapeno version of ecache_flush_line. Uses Jalapeno Ecache Displacement + * Flush feature to flush all 4 sets/ways. + */ +#define ECACHE_FLUSH_LINE(physaddr, ec_set_size, scr1, scr2) \ + CPU_INDEX(scr1, scr2); \ + sllx scr1, JP_ECFLUSH_PORTID_SHIFT, scr1; \ + set JP_ECACHE_IDX_DISP_FLUSH, scr2; \ + or scr2, scr1, scr2; \ + sub ec_set_size, 1, scr1; \ + and physaddr, scr1, scr1; \ + or scr2, scr1, scr1; \ + ECACHE_REFLUSH_LINE(ec_set_size, scr1, scr2) + +/* + * Macro for getting ecache size from cpunodes structure + * scr1: Scratch, ecache size returned in this + * scr2: Scratch + */ +#define GET_ECACHE_SIZE(scr1, scr2) \ + CPU_INDEX(scr1, scr2); \ + mulx scr1, CPU_NODE_SIZE, scr1; \ + set cpunodes + ECACHE_SIZE, scr2; \ + ld [scr1 + scr2], scr1 + +/* END CSTYLED */ + +#endif /* !lint */ + +#if defined(lint) + +/* ARGSUSED */ +void +shipit(int upaid, int bn) +{ return; } + +#else /* lint */ + +/* + * Ship mondo to aid using implicit busy/nack pair (bn ignored) + */ + ENTRY_NP(shipit) + sll %o0, IDCR_PID_SHIFT, %g1 ! IDCR<18:14> = agent id + or %g1, IDCR_OFFSET, %g1 ! IDCR<13:0> = 0x70 + stxa %g0, [%g1]ASI_INTR_DISPATCH ! 
interrupt vector dispatch + membar #Sync + retl + nop + SET_SIZE(shipit) + +#endif /* lint */ + + +/* + * flush_ecache: + * %o0 - 64 bit physical address + * %o1 - ecache size + * %o2 - ecache linesize + */ +#if defined(lint) + +/*ARGSUSED*/ +void +flush_ecache(uint64_t physaddr, size_t ecache_size, size_t ecache_linesize) +{} + +#else /* !lint */ + + ENTRY(flush_ecache) +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + CHK_JP_ERRATA85_ENABLED(%g1, flush_ecache_1); + JP_FORCE_FULL_SPEED(%o3, %g1, %g2, %g3); /* %o3: saved speed */ +flush_ecache_1: +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + + /* + * Flush the entire Ecache using displacement flush. + */ + ECACHE_FLUSHALL(%o1, %o2, %o0, %o4) + +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + CHK_JP_ERRATA85_ENABLED(%g1, flush_ecache_2); + JP_RESTORE_SPEED(%o3, %g1, %g2, %g3); /* %o3: saved speed */ +flush_ecache_2: +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + + retl + nop + SET_SIZE(flush_ecache) + +#endif /* lint */ + + +#if defined(lint) + +void +fast_ecc_err(void) +{} + +#else /* lint */ + + .section ".text" + .align 64 + ENTRY_NP(fast_ecc_err) + + /* + * Turn off CEEN and NCEEN. + */ + ldxa [%g0]ASI_ESTATE_ERR, %g3 + andn %g3, EN_REG_NCEEN + EN_REG_CEEN, %g4 + stxa %g4, [%g0]ASI_ESTATE_ERR + membar #Sync ! membar sync required + + /* + * Do the CPU log out capture. + * %g3 = "failed?" return value. + * %g2 = Input = AFAR. Output the clo_flags info which is passed + * into this macro via %g4. Output only valid if cpu_private + * struct has not been initialized. + * CHPR_FECCTL0_LOGOUT = cpu logout structure offset input + * %g4 = Trap information stored in the cpu logout flags field + * %g5 = scr1 + * %g6 = scr2 + * %g3 = scr3 + * %g4 = scr4 + */ + and %g3, EN_REG_CEEN, %g4 ! store the CEEN value, TL=0 + set CHPR_FECCTL0_LOGOUT, %g6 + DO_CPU_LOGOUT(%g3, %g2, %g6, %g4, %g5, %g6, %g3, %g4) + + /* + * Flush the Ecache to get the error out of the Ecache. If the UCC + * or UCU is on a dirty line, then the following flush will turn + * that into a WDC or WDU, respectively. + */ + CPU_INDEX(%g4, %g5) + mulx %g4, CPU_NODE_SIZE, %g4 + set cpunodes, %g5 + add %g4, %g5, %g4 + ld [%g4 + ECACHE_LINESIZE], %g5 + ld [%g4 + ECACHE_SIZE], %g4 +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + CHK_JP_ERRATA85_ENABLED(%g6, fast_ecc_err_1); + set jp_estar_tl0_data, %g6 + stx %g2, [%g6 + 0] + stx %g3, [%g6 + 8] + JP_FORCE_FULL_SPEED(%g2, %g3, %g6, %g7) /* %g2: saved speed */ +fast_ecc_err_1: +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + ECACHE_FLUSHALL(%g4, %g5, %g6, %g7) +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + CHK_JP_ERRATA85_ENABLED(%g6, fast_ecc_err_2); + JP_RESTORE_SPEED(%g2, %g3, %g6, %g7) /* %g2: saved speed */ + set jp_estar_tl0_data, %g6 + ldx [%g6 + 0], %g2 + ldx [%g6 + 8], %g3 +fast_ecc_err_2: +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + + /* + * Flush the Dcache. Since bad data could have been installed in + * the Dcache we must flush it before re-enabling it. + */ + ASM_LD(%g5, dcache_size) + ASM_LD(%g6, dcache_linesize) + CH_DCACHE_FLUSHALL(%g5, %g6, %g7) + + /* + * Flush the Icache. Since we turned off the Icache to capture the + * Icache line it is now stale or corrupted and we must flush it + * before re-enabling it. 
+ */ + GET_CPU_PRIVATE_PTR(%g0, %g5, %g7, fast_ecc_err_4); + ld [%g5 + CHPR_ICACHE_LINESIZE], %g6 + ba,pt %icc, 5f + ld [%g5 + CHPR_ICACHE_SIZE], %g5 +fast_ecc_err_4: + ASM_LD(%g5, icache_size) + ASM_LD(%g6, icache_linesize) +5: + CH_ICACHE_FLUSHALL(%g5, %g6, %g7, %g4) + + /* + * Restore the Dcache and Icache to the previous state. + */ + stxa %g1, [%g0]ASI_DCU + flush %g0 /* flush required after changing the IC bit */ + + /* + * Make sure our CPU logout operation was successful. + */ + cmp %g3, %g0 + be 8f + nop + + /* + * If the logout structure had been busy, how many times have + * we tried to use it and failed (nesting count)? If we have + * already recursed a substantial number of times, then we can + * assume things are not going to get better by themselves and + * so it would be best to panic. + */ + cmp %g3, CLO_NESTING_MAX + blt 7f + nop + + call ptl1_panic + mov PTL1_BAD_ECC, %g1 + +7: + /* + * Otherwise, if the logout structure was busy but we have not + * nested more times than our maximum value, then we simply + * issue a retry. Our TL=0 trap handler code will check and + * clear the AFSR after it is done logging what is currently + * in the logout struct and handle this event at that time. + */ + retry +8: + /* + * Call cpu_fast_ecc_error via systrap at PIL 14 unless we're + * already at PIL 15. + */ + set cpu_fast_ecc_error, %g1 + rdpr %pil, %g4 + cmp %g4, PIL_14 + ba sys_trap + movl %icc, PIL_14, %g4 + + SET_SIZE(fast_ecc_err) + +#endif /* lint */ + + +/* + * Fast ECC error at TL>0 handler + * We get here via trap 70 at TL>0->Software trap 0 at TL>0. We enter + * this routine with %g1 and %g2 already saved in %tpc, %tnpc and %tstate. + * For a complete description of the Fast ECC at TL>0 handling see the + * comment block "Cheetah/Cheetah+ Fast ECC at TL>0 trap strategy" in + * us3_common_asm.s + */ +#if defined(lint) + +void +fast_ecc_tl1_err(void) +{} + +#else /* lint */ + + .section ".text" + .align 64 + ENTRY_NP(fast_ecc_tl1_err) + + /* + * This macro turns off the D$/I$ if they are on and saves their + * original state in ch_err_tl1_tmp, saves all the %g registers in the + * ch_err_tl1_data structure, updates the ch_err_tl1_flags and saves + * the %tpc in ch_err_tl1_tpc. At the end of this macro, %g1 will + * point to the ch_err_tl1_data structure and the original D$/I$ state + * will be saved in ch_err_tl1_tmp. All %g registers except for %g1 + * will be available. + */ + CH_ERR_TL1_FECC_ENTER; + + /* + * Get the diagnostic logout data. %g4 must be initialized to + * current CEEN state, %g5 must point to logout structure in + * ch_err_tl1_data_t. %g3 will contain the nesting count upon + * return. + */ + ldxa [%g0]ASI_ESTATE_ERR, %g4 + and %g4, EN_REG_CEEN, %g4 + add %g1, CH_ERR_TL1_LOGOUT, %g5 + DO_TL1_CPU_LOGOUT(%g3, %g2, %g4, %g5, %g6, %g3, %g4) + + /* + * If the logout nesting count is exceeded, we're probably + * not making any progress, try to panic instead. + */ + cmp %g3, CLO_NESTING_MAX + bge fecc_tl1_err + nop + + /* + * Save the current CEEN and NCEEN state in %g7 and turn them off + * before flushing the Ecache. + */ + ldxa [%g0]ASI_ESTATE_ERR, %g7 + andn %g7, EN_REG_CEEN | EN_REG_NCEEN, %g5 + stxa %g5, [%g0]ASI_ESTATE_ERR + membar #Sync + + /* + * Flush the Ecache, using the largest possible cache size with the + * smallest possible line size since we can't get the actual sizes + * from the cpu_node due to DTLB misses. 
+ */ + set JP_ECACHE_MAX_SIZE, %g4 +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + CHK_JP_ERRATA85_ENABLED(%g6, fast_ecc_tl1_err_1); + set jp_estar_tl1_data, %g6 + stx %g2, [%g6 + 0] + stx %g3, [%g6 + 8] + JP_FORCE_FULL_SPEED(%g2, %g3, %g5, %g6) +fast_ecc_tl1_err_1: +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + ECACHE_FLUSHALL(%g4, JP_ECACHE_MAX_LSIZE, %g5, %g6) +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + CHK_JP_ERRATA85_ENABLED(%g6, fast_ecc_tl1_err_2); + JP_RESTORE_SPEED(%g2, %g3, %g5, %g6) + set jp_estar_tl1_data, %g6 + ldx [%g6 + 0], %g2 + ldx [%g6 + 8], %g3 +fast_ecc_tl1_err_2: +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + + /* + * Restore CEEN and NCEEN to the previous state. + */ + stxa %g7, [%g0]ASI_ESTATE_ERR + membar #Sync + + /* + * If we turned off the D$, then flush it and turn it back on. + */ + ldxa [%g1 + CH_ERR_TL1_TMP]%asi, %g3 + andcc %g3, CH_ERR_TSTATE_DC_ON, %g0 + bz %xcc, 3f + nop + + /* + * Flush the D$. + */ + ASM_LD(%g4, dcache_size) + ASM_LD(%g5, dcache_linesize) + CH_DCACHE_FLUSHALL(%g4, %g5, %g6) + + /* + * Turn the D$ back on. + */ + ldxa [%g0]ASI_DCU, %g3 + or %g3, DCU_DC, %g3 + stxa %g3, [%g0]ASI_DCU + membar #Sync +3: + /* + * If we turned off the I$, then flush it and turn it back on. + */ + ldxa [%g1 + CH_ERR_TL1_TMP]%asi, %g3 + andcc %g3, CH_ERR_TSTATE_IC_ON, %g0 + bz %xcc, 4f + nop + + /* + * Flush the I$. + */ + ASM_LD(%g4, icache_size) + ASM_LD(%g5, icache_linesize) + CH_ICACHE_FLUSHALL(%g4, %g5, %g6, %g3) + + /* + * Turn the I$ back on. Changing DCU_IC requires flush. + */ + ldxa [%g0]ASI_DCU, %g3 + or %g3, DCU_IC, %g3 + stxa %g3, [%g0]ASI_DCU + flush %g0 +4: + +#ifdef TRAPTRACE + /* + * Get current trap trace entry physical pointer. + */ + CPU_INDEX(%g6, %g5) + sll %g6, TRAPTR_SIZE_SHIFT, %g6 + set trap_trace_ctl, %g5 + add %g6, %g5, %g6 + ld [%g6 + TRAPTR_LIMIT], %g5 + tst %g5 + be %icc, skip_traptrace + nop + ldx [%g6 + TRAPTR_PBASE], %g5 + ld [%g6 + TRAPTR_OFFSET], %g4 + add %g5, %g4, %g5 + + /* + * Create trap trace entry. + */ + rd %asi, %g7 + wr %g0, TRAPTR_ASI, %asi + rd STICK, %g4 + stxa %g4, [%g5 + TRAP_ENT_TICK]%asi + rdpr %tl, %g4 + stha %g4, [%g5 + TRAP_ENT_TL]%asi + rdpr %tt, %g4 + stha %g4, [%g5 + TRAP_ENT_TT]%asi + rdpr %tpc, %g4 + stna %g4, [%g5 + TRAP_ENT_TPC]%asi + rdpr %tstate, %g4 + stxa %g4, [%g5 + TRAP_ENT_TSTATE]%asi + stna %sp, [%g5 + TRAP_ENT_SP]%asi + stna %g0, [%g5 + TRAP_ENT_TR]%asi + wr %g0, %g7, %asi + ldxa [%g1 + CH_ERR_TL1_SDW_AFAR]%asi, %g3 + ldxa [%g1 + CH_ERR_TL1_SDW_AFSR]%asi, %g4 + wr %g0, TRAPTR_ASI, %asi + stna %g3, [%g5 + TRAP_ENT_F1]%asi + stna %g4, [%g5 + TRAP_ENT_F2]%asi + wr %g0, %g7, %asi + ldxa [%g1 + CH_ERR_TL1_AFAR]%asi, %g3 + ldxa [%g1 + CH_ERR_TL1_AFSR]%asi, %g4 + wr %g0, TRAPTR_ASI, %asi + stna %g3, [%g5 + TRAP_ENT_F3]%asi + stna %g4, [%g5 + TRAP_ENT_F4]%asi + wr %g0, %g7, %asi + + /* + * Advance trap trace pointer. + */ + ld [%g6 + TRAPTR_OFFSET], %g5 + ld [%g6 + TRAPTR_LIMIT], %g4 + st %g5, [%g6 + TRAPTR_LAST_OFFSET] + add %g5, TRAP_ENT_SIZE, %g5 + sub %g4, TRAP_ENT_SIZE, %g4 + cmp %g5, %g4 + movge %icc, 0, %g5 + st %g5, [%g6 + TRAPTR_OFFSET] +skip_traptrace: +#endif /* TRAPTRACE */ + + /* + * If nesting count is not zero, skip all the AFSR/AFAR + * handling and just do the necessary cache-flushing. + */ + ldxa [%g1 + CH_ERR_TL1_NEST_CNT]%asi, %g2 + brnz %g2, 6f + nop + + /* + * If a UCU followed by a WDU has occurred go ahead and panic + * since a UE will occur (on the retry) before the UCU and WDU + * messages are enqueued. 
+ */ + ldxa [%g1 + CH_ERR_TL1_AFSR]%asi, %g3 + set 1, %g4 + sllx %g4, C_AFSR_UCU_SHIFT, %g4 + btst %g4, %g3 ! UCU in original AFSR? + bz %xcc, 6f + nop + ldxa [%g0]ASI_AFSR, %g4 ! current AFSR + or %g3, %g4, %g3 ! %g3 = original + current AFSR + set 1, %g4 + sllx %g4, C_AFSR_WDU_SHIFT, %g4 + btst %g4, %g3 ! WDU in original or current AFSR? + bnz %xcc, fecc_tl1_err + nop + +6: + /* + * We fall into this macro if we've successfully logged the error in + * the ch_err_tl1_data structure and want the PIL15 softint to pick + * it up and log it. %g1 must point to the ch_err_tl1_data structure. + * Restores the %g registers and issues retry. + */ + CH_ERR_TL1_EXIT; + /* + * Establish panic exit label. + */ + CH_ERR_TL1_PANIC_EXIT(fecc_tl1_err); + + SET_SIZE(fast_ecc_tl1_err) + +#endif /* lint */ + + +#if defined(lint) + +uint64_t +get_jbus_config(void) +{ return (0); } + +/* ARGSUSED */ +void +set_jbus_config(uint64_t jbus_config) +{} + +/* ARGSUSED */ +void +set_mcu_ctl_reg1(uint64_t mcu_ctl) +{} + +uint64_t +get_mcu_ctl_reg1(void) +{ return (0); } + +#else /* lint */ + + ENTRY(get_jbus_config) + ldxa [%g0]ASI_JBUS_CONFIG, %o0 + retl + nop + SET_SIZE(get_jbus_config) + + ENTRY(set_jbus_config) + stxa %o0, [%g0]ASI_JBUS_CONFIG + membar #Sync + retl + nop + SET_SIZE(set_jbus_config) + + + ENTRY(get_mcu_ctl_reg1) + ldxa [%g0]ASI_MCU_CTRL, %o0 ! MCU control reg1 is at offset 0 + retl + nop + SET_SIZE(get_mcu_ctl_reg1) + + + ENTRY(set_mcu_ctl_reg1) + stxa %o0, [%g0]ASI_MCU_CTRL ! MCU control reg1 is at offset 0 + membar #Sync + retl + nop + SET_SIZE(set_mcu_ctl_reg1) + +#endif /* lint */ + + +#if defined(lint) +/* + * scrubphys - Pass in the aligned physical memory address + * that you want to scrub, along with the ecache set size. + * + * 1) Displacement flush the E$ line corresponding to %addr. + * The first ldxa guarantees that the %addr is no longer in + * M, O, or E (goes to I or S (if instruction fetch also happens). + * 2) "Write" the data using a CAS %addr,%g0,%g0. + * The casxa guarantees a transition from I to M or S to M. + * 3) Displacement flush the E$ line corresponding to %addr. + * The second ldxa pushes the M line out of the ecache, into the + * writeback buffers, on the way to memory. + * 4) The "membar #Sync" pushes the cache line out of the writeback + * buffers onto the bus, on the way to dram finally. + * + * This is a modified version of the algorithm suggested by Gary Lauterbach. + * In theory the CAS %addr,%g0,%g0 is supposed to mark the addr's cache line + * as modified, but then we found out that for spitfire, if it misses in the + * E$ it will probably install as an M, but if it hits in the E$, then it + * will stay E, if the store doesn't happen. So the first displacement flush + * should ensure that the CAS will miss in the E$. Arrgh. + */ +/* ARGSUSED */ +void +scrubphys(uint64_t paddr, int ecache_set_size) +{} + +#else /* lint */ + ENTRY(scrubphys) + rdpr %pstate, %o4 + andn %o4, PSTATE_IE | PSTATE_AM, %o5 + wrpr %o5, %g0, %pstate ! 
clear IE, AM bits + +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + CHK_JP_ERRATA85_ENABLED(%g1, scrubphys_1); + JP_FORCE_FULL_SPEED(%o5, %g1, %g2, %g3) /* %o5: saved speed */ +scrubphys_1: +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + + ECACHE_FLUSH_LINE(%o0, %o1, %o2, %o3) + casxa [%o0]ASI_MEM, %g0, %g0 + ECACHE_REFLUSH_LINE(%o1, %o2, %o3) + +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + CHK_JP_ERRATA85_ENABLED(%g1, scrubphys_2); + JP_RESTORE_SPEED(%o5, %g1, %g2, %g3) /* %o5: saved speed */ +scrubphys_2: +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + + wrpr %g0, %o4, %pstate ! restore earlier pstate register value + + retl + membar #Sync ! move the data out of the load buffer + SET_SIZE(scrubphys) + +#endif /* lint */ + + +#if defined(lint) +/* + * clearphys - Pass in the aligned physical memory address + * that you want to push out, as a ecache_linesize byte block of zeros, + * from the ecache zero-filled. + */ +/* ARGSUSED */ +void +clearphys(uint64_t paddr, int ecache_set_size, int ecache_linesize) +{ +} + +#else /* lint */ + ENTRY(clearphys) + /* turn off IE, AM bits */ + rdpr %pstate, %o4 + andn %o4, PSTATE_IE | PSTATE_AM, %o5 + wrpr %o5, %g0, %pstate + + /* turn off NCEEN */ + ldxa [%g0]ASI_ESTATE_ERR, %o5 + andn %o5, EN_REG_NCEEN, %o3 + stxa %o3, [%g0]ASI_ESTATE_ERR + membar #Sync + + /* zero the E$ line */ +1: + subcc %o2, 8, %o2 + bge 1b + stxa %g0, [%o0 + %o2]ASI_MEM + +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + CHK_JP_ERRATA85_ENABLED(%g1, clearphys_1); + JP_FORCE_FULL_SPEED(%o5, %g1, %g2, %g3) /* %o5: saved speed */ +clearphys_1: +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + + ECACHE_FLUSH_LINE(%o0, %o1, %o2, %o3) + casxa [%o0]ASI_MEM, %g0, %g0 + ECACHE_REFLUSH_LINE(%o1, %o2, %o3) + +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + CHK_JP_ERRATA85_ENABLED(%g1, clearphys_2); + JP_RESTORE_SPEED(%o5, %g1, %g2, %g3) /* %o5: saved speed */ +clearphys_2: +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + + /* clear the AFSR */ + ldxa [%g0]ASI_AFSR, %o1 + stxa %o1, [%g0]ASI_AFSR + membar #Sync + + /* turn NCEEN back on */ + stxa %o5, [%g0]ASI_ESTATE_ERR + membar #Sync + + /* return and re-enable IE and AM */ + retl + wrpr %g0, %o4, %pstate + SET_SIZE(clearphys) + +#endif /* lint */ + + +#if defined(lint) +/* + * Jalapeno Ecache displacement flush the specified line from the E$ + * + * Register usage: + * %o0 - 64 bit physical address for flushing + * %o1 - Ecache set size + */ +/*ARGSUSED*/ +void +ecache_flush_line(uint64_t flushaddr, int ec_set_size) +{ +} +#else /* lint */ + ENTRY(ecache_flush_line) + +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + CHK_JP_ERRATA85_ENABLED(%g1, ecache_flush_line_1); + JP_FORCE_FULL_SPEED(%o5, %g1, %g2, %g3) /* %o5: saved speed */ +ecache_flush_line_1: +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + + ECACHE_FLUSH_LINE(%o0, %o1, %o2, %o3) + +#if defined(JALAPENO) && defined(JALAPENO_ERRATA_85) + CHK_JP_ERRATA85_ENABLED(%g1, ecache_flush_line_2); + JP_RESTORE_SPEED(%o5, %g1, %g2, %g3) /* %o5: saved speed */ +ecache_flush_line_2: +#endif /* JALAPENO && JALAPENO_ERRATA_85 */ + + retl + nop + SET_SIZE(ecache_flush_line) +#endif /* lint */ + + +/* + * Perform necessary cpu workaround to ensure jbus ordering. + * Called only from Fire systems. + * CPU's internal "invalidate FIFOs" are flushed. 
+ */ + +#if defined(lint) +void +jbus_stst_order() +{} +#else /* lint */ + +#define VIS_BLOCKSIZE 64 + + .seg ".data" + .align VIS_BLOCKSIZE + .type sync_buf, #object +sync_buf: + .skip VIS_BLOCKSIZE + .size sync_buf, VIS_BLOCKSIZE + + ENTRY(jbus_stst_order) + set sync_buf, %o1 + + rd %fprs, %o2 ! %o2 = saved fprs + or %o2, FPRS_FEF, %o3 + wr %g0, %o3, %fprs ! make sure fp is enabled + stda %d0, [%o1]ASI_BLK_COMMIT_P + wr %o2, 0, %fprs ! restore fprs + + retl + membar #Sync + SET_SIZE(jbus_stst_order) + +#endif /* lint */ + +#if defined(lint) +/* + * This routine will not be called in Jalapeno systems. + */ +void +flush_ipb(void) +{ return; } + +#else /* lint */ + + ENTRY(flush_ipb) + retl + nop + SET_SIZE(flush_ipb) + +#endif /* lint */ diff --git a/usr/src/uts/sun4u/cpu/us3_kdi.c b/usr/src/uts/sun4u/cpu/us3_kdi.c new file mode 100644 index 0000000000..245da42b51 --- /dev/null +++ b/usr/src/uts/sun4u/cpu/us3_kdi.c @@ -0,0 +1,158 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * CPU-specific functions needed by the Kernel-Debugger Interface (KDI). These + * functions are invoked directly by the kernel debugger (kmdb) while the system + * has been stopped, and as such must not use any kernel facilities that block + * or otherwise rely on forward progress by other parts of the kernel. + * + * These functions may also be called before unix`_start, and as such cannot + * use any kernel facilities that must be initialized as part of system start. + * An example of such a facility is drv_usecwait(), which relies on a parameter + * that is initialized by the unix module. As a result, drv_usecwait() may not + * be used by KDI functions. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/archsystm.h> +#include <sys/machsystm.h> +#include <sys/cpu_module.h> +#include <sys/xc_impl.h> +#include <sys/intreg.h> +#include <sys/kdi_impl.h> + +/* + * We keep our own copies, used for cache flushing, because we can be called + * before cpu_fiximpl(). 
+ */ +static int kdi_dcache_size; +static int kdi_dcache_linesize; +static int kdi_icache_size; +static int kdi_icache_linesize; + +/* + * Assembly support for cheetah modules in cheetah_asm.s + */ +extern int idsr_busy(void); +extern void init_mondo_nocheck(xcfunc_t *func, uint64_t arg1, uint64_t arg2); +extern void shipit(int, int); +extern void kdi_flush_idcache(int, int, int, int); +extern int kdi_get_stick(uint64_t *); + +static int +kdi_cpu_ready_iter(int (*cb)(int, void *), void *arg) +{ + int rc, i; + + for (rc = 0, i = 0; i < NCPU; i++) { + if (CPU_IN_SET(cpu_ready_set, i)) + rc += cb(i, arg); + } + + return (rc); +} + +/* + * Sends a cross-call to a specified processor. The caller assumes + * responsibility for repetition of cross-calls, as appropriate (MARSA for + * debugging). + */ +static int +kdi_xc_one(int cpuid, void (*func)(uintptr_t, uintptr_t), uintptr_t arg1, + uintptr_t arg2) +{ + uint64_t idsr; + uint64_t busymask; + + /* + * if (idsr_busy()) + * return (KDI_XC_RES_ERR); + */ + + init_mondo_nocheck((xcfunc_t *)func, arg1, arg2); + + shipit(cpuid, 0); + +#if defined(JALAPENO) || defined(SERRANO) + /* + * Lower 2 bits of the agent ID determine which BUSY/NACK pair + * will be used for dispatching interrupt. For now, assume + * there are no more than IDSR_BN_SETS CPUs, hence no aliasing + * issues with respect to BUSY/NACK pair usage. + */ + busymask = IDSR_BUSY_BIT(cpuid); +#else /* JALAPENO || SERRANO */ + busymask = IDSR_BUSY; +#endif /* JALAPENO || SERRANO */ + + if ((idsr = getidsr()) == 0) + return (KDI_XC_RES_OK); + else if (idsr & busymask) + return (KDI_XC_RES_BUSY); + else + return (KDI_XC_RES_NACK); +} + +static void +kdi_tickwait(clock_t nticks) +{ + clock_t endtick = gettick() + nticks; + + while (gettick() < endtick); +} + +static void +kdi_cpu_init(int dcache_size, int dcache_linesize, int icache_size, + int icache_linesize) +{ + kdi_dcache_size = dcache_size; + kdi_dcache_linesize = dcache_linesize; + kdi_icache_size = icache_size; + kdi_icache_linesize = icache_linesize; +} + +/* used directly by kdi_read/write_phys */ +void +kdi_flush_caches(void) +{ + kdi_flush_idcache(kdi_dcache_size, kdi_dcache_linesize, + kdi_icache_size, kdi_icache_linesize); +} + +void +cpu_kdi_init(kdi_t *kdi) +{ + kdi->kdi_flush_caches = kdi_flush_caches; + kdi->mkdi_cpu_init = kdi_cpu_init; + kdi->mkdi_cpu_ready_iter = kdi_cpu_ready_iter; + kdi->mkdi_xc_one = kdi_xc_one; + kdi->mkdi_tickwait = kdi_tickwait; + kdi->mkdi_get_stick = kdi_get_stick; +} |
