Diffstat (limited to 'usr/src/lib/libc/capabilities/sun4u-us3/common/memcpy.s')
-rw-r--r-- | usr/src/lib/libc/capabilities/sun4u-us3/common/memcpy.s | 1053
1 file changed, 1053 insertions, 0 deletions
diff --git a/usr/src/lib/libc/capabilities/sun4u-us3/common/memcpy.s b/usr/src/lib/libc/capabilities/sun4u-us3/common/memcpy.s new file mode 100644 index 0000000000..5b8bbff7cc --- /dev/null +++ b/usr/src/lib/libc/capabilities/sun4u-us3/common/memcpy.s @@ -0,0 +1,1053 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + */ + + .file "memcpy.s" + +/* + * memcpy(s1, s2, len) + * + * Copy s2 to s1, always copy n bytes. + * Note: this C code does not work for overlapped copies. + * Memmove() and bcopy() do. + * + * Fast assembler language version of the following C-program for memcpy + * which represents the `standard' for the C-library. + * + * void * + * memcpy(void *s, const void *s0, size_t n) + * { + * if (n != 0) { + * char *s1 = s; + * const char *s2 = s0; + * do { + * *s1++ = *s2++; + * } while (--n != 0); + * } + * return (s); + * } + */ + +#include <sys/asm_linkage.h> +#include <sys/sun4asi.h> +#include <sys/trap.h> + +#define ICACHE_LINE_SIZE 64 +#define BLOCK_SIZE 64 +#define FPRS_FEF 0x4 + +#define SHORTCOPY 3 +#define SMALL_MAX 39 +#define MEDIUM_MAX 255 +#define MED_WMAX 256 /* max copy for medium word-aligned case */ +#define MED_MAX 256 /* max copy for medium longword-aligned case */ + +#ifndef BSTORE_SIZE +#define BSTORE_SIZE 256 /* min copy size for block store */ +#endif + + ANSI_PRAGMA_WEAK(memmove,function) + ANSI_PRAGMA_WEAK(memcpy,function) + + ENTRY(memmove) + cmp %o1, %o0 ! if from address is >= to use forward copy + bgeu %ncc, .forcpy ! else use backward if ... + sub %o0, %o1, %o4 ! get difference of two addresses + cmp %o2, %o4 ! compare size and difference of addresses + bleu %ncc, .forcpy ! if size is bigger, do overlapped copy + nop + + ! + ! an overlapped copy that must be done "backwards" + ! +.ovbc: + mov %o0, %g1 ! save dest address for return val + add %o1, %o2, %o1 ! get to end of source space + add %o0, %o2, %o0 ! get to end of destination space + + cmp %o2, 24 + bgeu,pn %ncc, .dbalign + nop + cmp %o2, 4 + blt,pn %ncc, .byte + sub %o2, 3, %o2 +.byte4loop: + ldub [%o1-1], %o3 ! load last byte + stb %o3, [%o0-1] ! store last byte + sub %o1, 4, %o1 + ldub [%o1+2], %o3 ! load 2nd from last byte + stb %o3, [%o0-2] ! store 2nd from last byte + sub %o0, 4, %o0 + ldub [%o1+1], %o3 ! load 3rd from last byte + stb %o3, [%o0+1] ! store 3rd from last byte + subcc %o2, 4, %o2 + ldub [%o1], %o3 ! load 4th from last byte + bgu,pt %ncc, .byte4loop + stb %o3, [%o0] ! store 4th from last byte +.byte: + addcc %o2, 3, %o2 + bz,pt %ncc, .exit +.byteloop: + dec %o1 ! decrement src address + ldub [%o1], %o3 ! read a byte + dec %o0 ! decrement dst address + deccc %o2 ! decrement count + bgu,pt %ncc, .byteloop ! 
loop until done + stb %o3, [%o0] ! write byte +.exit: + retl + mov %g1, %o0 + + .align 16 +.dbalign: + andcc %o0, 7, %o5 ! bytes till DST 8 byte aligned + bz,pt %ncc, .dbmed + sub %o2, %o5, %o2 ! update count +.dbalign1: + dec %o1 ! decrement src address + ldub [%o1], %o3 ! read a byte + dec %o0 ! decrement dst address + deccc %o5 ! decrement count + bgu,pt %ncc, .dbalign1 ! loop until done + stb %o3, [%o0] ! store a byte + +! check for src long word alignment +.dbmed: + andcc %o1, 7, %g0 ! chk src long word alignment + bnz,pn %ncc, .dbbck + nop +! +! Following code is for overlapping copies where src and dest +! are long word aligned +! + cmp %o2, 4095 + blt,pn %ncc, .dbmedl32enter ! go to no prefetch code + nop + prefetch [%o1 - (1 * BLOCK_SIZE)], 20 ! into the prefetch cache + sub %o2, 63, %o2 ! adjust length to allow cc test + ! for end of loop + prefetch [%o1 - (2 * BLOCK_SIZE)], 20 ! into the prefetch cache + rd %fprs, %o3 ! o3 = fprs + ! if fprs.fef == 0, set it. Checking it, requires 2 instructions. + ! So set it anyway, without checking. + prefetch [%o1 - (3 * BLOCK_SIZE)], 20 ! into the prefetch cache + wr %g0, 0x4, %fprs ! fprs.fef = 1 + prefetch [%o1 - (4 * BLOCK_SIZE)], 20 ! into the prefetch cache +.dbmedl64: + prefetch [%o1 - (5 * BLOCK_SIZE)], 20 ! into the prefetch cache + ldd [%o1-8], %d4 ! load + subcc %o2, 64, %o2 ! decrement length count + std %d4, [%o0-8] ! and store + ldd [%o1-16], %d2 ! a block of 64 bytes + sub %o1, 64, %o1 ! decrease src ptr by 64 + std %d2, [%o0-16] + sub %o0, 64, %o0 ! decrease dst ptr by 64 + ldd [%o1+40], %d4 + std %d4, [%o0+40] + ldd [%o1+32], %d2 + std %d2, [%o0+32] + ldd [%o1+24], %d4 + std %d4, [%o0+24] + ldd [%o1+16], %d2 + std %d2, [%o0+16] + ldd [%o1+8], %d4 + std %d4, [%o0+8] + ldd [%o1], %d2 + bgu,pt %ncc, .dbmedl64 ! repeat if at least 64 bytes left + std %d2, [%o0] + add %o2, 63, %o2 ! restore offset adjustment + and %o3, 0x4, %o3 ! fprs.du = fprs.dl = 0 + wr %o3, %g0, %fprs ! fprs = o3 restore fprs +.dbmedl32enter: + subcc %o2, 31, %o2 ! adjust length to allow cc test + ! for end of loop + ble,pt %ncc, .dbmedl31 ! skip big loop if less than 32 + nop +.dbmedl32: + ldx [%o1-8], %o4 ! load + subcc %o2, 32, %o2 ! decrement length count + stx %o4, [%o0-8] ! and store + ldx [%o1-16], %o3 ! a block of 32 bytes + sub %o1, 32, %o1 ! decrease src ptr by 32 + stx %o3, [%o0-16] + ldx [%o1+8], %o4 + sub %o0, 32, %o0 ! decrease dst ptr by 32 + stx %o4, [%o0+8] + ldx [%o1], %o3 + bgu,pt %ncc, .dbmedl32 ! repeat if at least 32 bytes left + stx %o3, [%o0] +.dbmedl31: + addcc %o2, 16, %o2 ! adjust remaining count + ble,pt %ncc, .dbmedl15 ! skip if 15 or fewer bytes left + nop ! + ldx [%o1-8], %o4 ! load and store 16 bytes + sub %o1, 16, %o1 ! decrease src ptr by 16 + stx %o4, [%o0-8] ! + sub %o2, 16, %o2 ! decrease count by 16 + ldx [%o1], %o3 ! + sub %o0, 16, %o0 ! decrease dst ptr by 16 + stx %o3, [%o0] +.dbmedl15: + addcc %o2, 15, %o2 ! restore count + bz,pt %ncc, .dbexit ! exit if finished + nop + cmp %o2, 8 + blt,pt %ncc, .dbremain ! skip if 7 or fewer bytes left + nop + ldx [%o1-8], %o4 ! load 8 bytes + sub %o1, 8, %o1 ! decrease src ptr by 8 + stx %o4, [%o0-8] ! and store 8 bytes + subcc %o2, 8, %o2 ! decrease count by 8 + bnz %ncc, .dbremain ! exit if finished + sub %o0, 8, %o0 ! decrease dst ptr by 8 + retl + mov %g1, %o0 + +! +! Following code is for overlapping copies where src and dest +! are not long word aligned +! + .align 16 +.dbbck: + rd %fprs, %o3 ! o3 = fprs + + ! if fprs.fef == 0, set it. Checking it, requires 2 instructions. + ! 
So set it anyway, without checking. + wr %g0, 0x4, %fprs ! fprs.fef = 1 + + alignaddr %o1, %g0, %o5 ! align src + ldd [%o5], %d0 ! get first 8 byte block + andn %o2, 7, %o4 ! prepare src ptr for finishup code + cmp %o2, 32 + blt,pn %ncc, .dbmv8 + sub %o1, %o4, %o1 ! + cmp %o2, 4095 ! check for short memmoves + blt,pn %ncc, .dbmv32enter ! go to no prefetch code +.dbmv64: + ldd [%o5-8], %d2 ! load 8 bytes + ldd [%o5-16], %d4 ! load 8 bytes + sub %o5, 64, %o5 ! + ldd [%o5+40], %d6 ! load 8 bytes + sub %o0, 64, %o0 ! + ldd [%o5+32], %d8 ! load 8 bytes + sub %o2, 64, %o2 ! 64 less bytes to copy + ldd [%o5+24], %d18 ! load 8 bytes + cmp %o2, 64 ! do we have < 64 bytes remaining + ldd [%o5+16], %d28 ! load 8 bytes + ldd [%o5+8], %d30 ! load 8 bytes + prefetch [%o5 - (5 * BLOCK_SIZE)], 20 ! into the prefetch cache + faligndata %d2, %d0, %d10 ! extract 8 bytes out + ldd [%o5], %d0 ! load 8 bytes + std %d10, [%o0+56] ! store the current 8 bytes + faligndata %d4, %d2, %d12 ! extract 8 bytes out + std %d12, [%o0+48] ! store the current 8 bytes + faligndata %d6, %d4, %d14 ! extract 8 bytes out + std %d14, [%o0+40] ! store the current 8 bytes + faligndata %d8, %d6, %d16 ! extract 8 bytes out + std %d16, [%o0+32] ! store the current 8 bytes + faligndata %d18, %d8, %d20 ! extract 8 bytes out + std %d20, [%o0+24] ! store the current 8 bytes + faligndata %d28, %d18, %d22 ! extract 8 bytes out + std %d22, [%o0+16] ! store the current 8 bytes + faligndata %d30, %d28, %d24 ! extract 8 bytes out + std %d24, [%o0+8] ! store the current 8 bytes + faligndata %d0, %d30, %d26 ! extract 8 bytes out + bgeu,pt %ncc, .dbmv64 + std %d26, [%o0] ! store the current 8 bytes + + cmp %o2, 32 + blt,pn %ncc, .dbmvx + nop +.dbmv32: + ldd [%o5-8], %d2 ! load 8 bytes +.dbmv32enter: + ldd [%o5-16], %d4 ! load 8 bytes + sub %o5, 32, %o5 ! + ldd [%o5+8], %d6 ! load 8 bytes + sub %o0, 32, %o0 ! + faligndata %d2, %d0, %d10 ! extract 8 bytes out + ldd [%o5], %d0 ! load 8 bytes + sub %o2,32, %o2 ! 32 less bytes to copy + std %d10, [%o0+24] ! store the current 8 bytes + cmp %o2, 32 ! do we have < 32 bytes remaining + faligndata %d4, %d2, %d12 ! extract 8 bytes out + std %d12, [%o0+16] ! store the current 8 bytes + faligndata %d6, %d4, %d14 ! extract 8 bytes out + std %d14, [%o0+8] ! store the current 8 bytes + faligndata %d0, %d6, %d16 ! extract 8 bytes out + bgeu,pt %ncc, .dbmv32 + std %d16, [%o0] ! store the current 8 bytes +.dbmvx: + cmp %o2, 8 ! do we have < 8 bytes remaining + blt,pt %ncc, .dbmvfinish ! if yes, skip to finish up code + nop +.dbmv8: + ldd [%o5-8], %d2 + sub %o0, 8, %o0 ! since we are at the end + ! when we first enter the loop + sub %o2, 8, %o2 ! 8 less bytes to copy + sub %o5, 8, %o5 + cmp %o2, 8 ! do we have < 8 bytes remaining + faligndata %d2, %d0, %d8 ! extract 8 bytes out + std %d8, [%o0] ! store the current 8 bytes + bgeu,pt %ncc, .dbmv8 + fmovd %d2, %d0 +.dbmvfinish: + and %o3, 0x4, %o3 ! fprs.du = fprs.dl = 0 + tst %o2 + bz,pt %ncc, .dbexit + wr %o3, %g0, %fprs ! fprs = o3 restore fprs + +.dbremain: + cmp %o2, 4 + blt,pn %ncc, .dbbyte + nop + ldub [%o1-1], %o3 ! load last byte + stb %o3, [%o0-1] ! store last byte + sub %o1, 4, %o1 + ldub [%o1+2], %o3 ! load 2nd from last byte + stb %o3, [%o0-2] ! store 2nd from last byte + sub %o0, 4, %o0 + ldub [%o1+1], %o3 ! load 3rd from last byte + stb %o3, [%o0+1] ! store 3rd from last byte + subcc %o2, 4, %o2 + ldub [%o1], %o3 ! load 4th from last byte + stb %o3, [%o0] ! store 4th from last byte + bz,pt %ncc, .dbexit +.dbbyte: + dec %o1 ! 
decrement src address + ldub [%o1], %o3 ! read a byte + dec %o0 ! decrement dst address + deccc %o2 ! decrement count + bgu,pt %ncc, .dbbyte ! loop until done + stb %o3, [%o0] ! write byte +.dbexit: + retl + mov %g1, %o0 + SET_SIZE(memmove) + + + .align ICACHE_LINE_SIZE + ENTRY(memcpy) + ! adjust instruction alignment + nop ! Do not remove, these nops affect + nop ! icache alignment and performance +.forcpy: + cmp %o2, SMALL_MAX ! check for not small case + bgu,pn %ncc, .medium ! go to larger cases + mov %o0, %g1 ! save %o0 + cmp %o2, SHORTCOPY ! check for really short case + ble,pt %ncc, .smallleft ! + or %o0, %o1, %o3 ! prepare alignment check + andcc %o3, 0x3, %g0 ! test for alignment + bz,pt %ncc, .smallword ! branch to word aligned case + sub %o2, 3, %o2 ! adjust count to allow cc zero test +.smallnotalign4: + ldub [%o1], %o3 ! read byte + subcc %o2, 4, %o2 ! reduce count by 4 + stb %o3, [%o0] ! write byte + ldub [%o1+1], %o3 ! repeat for a total of 4 bytes + add %o1, 4, %o1 ! advance SRC by 4 + stb %o3, [%o0+1] + ldub [%o1-2], %o3 + add %o0, 4, %o0 ! advance DST by 4 + stb %o3, [%o0-2] + ldub [%o1-1], %o3 + bgu,pt %ncc, .smallnotalign4 ! loop til 3 or fewer bytes remain + stb %o3, [%o0-1] + add %o2, 3, %o2 ! restore count +.smallleft: + tst %o2 + bz,pt %ncc, .smallexit + nop +.smallleft3: ! 1, 2, or 3 bytes remain + ldub [%o1], %o3 ! load one byte + deccc %o2 ! reduce count for cc test + bz,pt %ncc, .smallexit + stb %o3, [%o0] ! store one byte + ldub [%o1+1], %o3 ! load second byte + deccc %o2 + bz,pt %ncc, .smallexit + stb %o3, [%o0+1] ! store second byte + ldub [%o1+2], %o3 ! load third byte + stb %o3, [%o0+2] ! store third byte + retl + mov %g1, %o0 ! restore %o0 + + .align 16 + nop ! affects loop icache alignment +.smallwords: + lduw [%o1], %o3 ! read word +.smallwordx: + subcc %o2, 8, %o2 ! update count + stw %o3, [%o0] ! write word + add %o1, 8, %o1 ! update SRC + lduw [%o1-4], %o3 ! read word + add %o0, 8, %o0 ! update DST + bgu,pt %ncc, .smallwords ! loop until done + stw %o3, [%o0-4] ! write word + addcc %o2, 7, %o2 ! restore count + bz,pt %ncc, .smallexit ! check for completion + nop + cmp %o2, 4 ! check for 4 or more bytes left + blt .smallleft3 ! if not, go to finish up + nop + lduw [%o1], %o3 + add %o1, 4, %o1 + subcc %o2, 4, %o2 + stw %o3, [%o0] + add %o0, 4, %o0 + bnz,pt %ncc, .smallleft3 + nop + retl + mov %g1, %o0 ! restore %o0 + +.smallword: + subcc %o2, 4, %o2 ! update count + bgu,pt %ncc, .smallwordx + lduw [%o1], %o3 ! read word + addcc %o2, 3, %o2 ! restore count + bz,pt %ncc, .smallexit + stw %o3, [%o0] ! write word + deccc %o2 ! reduce count for cc test + ldub [%o1+4], %o3 ! load one byte + bz,pt %ncc, .smallexit + stb %o3, [%o0+4] ! store one byte + ldub [%o1+5], %o3 ! load second byte + deccc %o2 + bz,pt %ncc, .smallexit + stb %o3, [%o0+5] ! store second byte + ldub [%o1+6], %o3 ! load third byte + stb %o3, [%o0+6] ! store third byte +.smallexit: + retl + mov %g1, %o0 ! restore %o0 + .align 16 +.medium: + neg %o0, %o5 + neg %o1, %o3 + andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned + and %o3, 7, %o3 ! bytes till SRC 8 byte aligned + + bz %ncc, 2f + sub %o5, %o3, %o3 ! -(bytes till SRC aligned after DST aligned) + ! o3={-7, -6, ... 7} o3>0 => SRC overaligned + + sub %o2, %o5, %o2 ! update count + +1: + ldub [%o1], %o4 + deccc %o5 + inc %o1 + stb %o4, [%o0] + bgu,pt %ncc, 1b + inc %o0 + + ! Now DST is 8-byte aligned. o0, o1, o2 are current. + +2: + andcc %o1, 0x3, %g0 ! test alignment + bnz,pt %ncc, .mediumsetup ! branch to skip aligned cases + ! 
if src, dst not aligned + prefetch [%o1 + (1 * BLOCK_SIZE)], 20 + +/* + * Handle all cases where src and dest are aligned on word + * or long word boundaries. Use unrolled loops for better + * performance. This option wins over standard large data + * move when source and destination is in cache for medium + * to short data moves. + */ + andcc %o1, 0x7, %g0 ! test word alignment + bz,pt %ncc, .medlword ! branch to long word aligned case + prefetch [%o1 + (2 * BLOCK_SIZE)], 20 + cmp %o2, MED_WMAX ! limit to store buffer size + bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop + nop + subcc %o2, 15, %o2 ! adjust length to allow cc test + ! for end of loop + ble,pt %ncc, .medw15 ! skip big loop if less than 16 + prefetch [%o1 + (3 * BLOCK_SIZE)], 20 +/* + * no need to put prefetch in loop as prefetches have + * already been issued for maximum loop size + */ +.medw16: + ld [%o1], %o4 ! load + subcc %o2, 16, %o2 ! decrement length count + stw %o4, [%o0] ! and store + ld [%o1+4], %o3 ! a block of 16 bytes + add %o1, 16, %o1 ! increase src ptr by 16 + stw %o3, [%o0+4] + ld [%o1-8], %o4 + add %o0, 16, %o0 ! increase dst ptr by 16 + stw %o4, [%o0-8] + ld [%o1-4], %o3 + bgu,pt %ncc, .medw16 ! repeat if at least 16 bytes left + stw %o3, [%o0-4] +.medw15: + addcc %o2, 15, %o2 ! restore count + bz,pt %ncc, .medwexit ! exit if finished + nop + cmp %o2, 8 + blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left + nop ! + ld [%o1], %o4 ! load 4 bytes + subcc %o2, 8, %o2 ! decrease count by 8 + stw %o4, [%o0] ! and store 4 bytes + add %o1, 8, %o1 ! increase src ptr by 8 + ld [%o1-4], %o3 ! load 4 bytes + add %o0, 8, %o0 ! increase dst ptr by 8 + stw %o3, [%o0-4] ! and store 4 bytes + bz %ncc, .medwexit ! exit if finished + nop +.medw7: ! count is ge 1, less than 8 + cmp %o2, 3 ! check for 4 bytes left + ble,pt %ncc, .medw3 ! skip if 3 or fewer bytes left + nop ! + ld [%o1], %o4 ! load 4 bytes + sub %o2, 4, %o2 ! decrease count by 4 + add %o1, 4, %o1 ! increase src ptr by 4 + stw %o4, [%o0] ! and store 4 bytes + add %o0, 4, %o0 ! increase dst ptr by 4 + tst %o2 ! check for zero bytes left + bz %ncc, .medwexit ! exit if finished + nop +.medw3: ! count is known to be 1, 2, or 3 + deccc %o2 ! reduce count by one + ldub [%o1], %o3 ! load one byte + bz,pt %ncc, .medwexit ! exit if last byte + stb %o3, [%o0] ! store one byte + ldub [%o1+1], %o3 ! load second byte + deccc %o2 ! reduce count by one + bz,pt %ncc, .medwexit ! exit if last byte + stb %o3, [%o0+1] ! store second byte + ldub [%o1+2], %o3 ! load third byte + stb %o3, [%o0+2] ! store third byte +.medwexit: + retl + mov %g1, %o0 ! restore %o0 + +/* + * Special case for handling when src and dest are both long word aligned + * and total data to move is between SMALL_MAX and MED_MAX bytes + */ + + .align 16 + nop +.medlword: ! long word aligned + ! length > SMALL_MAX + cmp %o2, MED_MAX ! limit to store buffer size + bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop + nop + subcc %o2, 31, %o2 ! adjust length to allow cc test + ! for end of loop + ble,pt %ncc, .medl31 ! skip big loop if less than 32 + prefetch [%o1 + (3 * BLOCK_SIZE)], 20 ! into the l2 cache +/* + * no need to put prefetch in loop as prefetches have + * already been issued for maximum loop size + */ +.medl32: + ldx [%o1], %o4 ! load + subcc %o2, 32, %o2 ! decrement length count + stx %o4, [%o0] ! and store + ldx [%o1+8], %o3 ! a block of 32 bytes + add %o1, 32, %o1 ! increase src ptr by 32 + stx %o3, [%o0+8] + ldx [%o1-16], %o4 + add %o0, 32, %o0 ! 
increase dst ptr by 32 + stx %o4, [%o0-16] + ldx [%o1-8], %o3 + bgu,pt %ncc, .medl32 ! repeat if at least 32 bytes left + stx %o3, [%o0-8] +.medl31: + addcc %o2, 16, %o2 ! adjust remaining count + ble,pt %ncc, .medl15 ! skip if 15 or fewer bytes left + nop ! + ldx [%o1], %o4 ! load and store 16 bytes + add %o1, 16, %o1 ! increase src ptr by 16 + stx %o4, [%o0] ! + sub %o2, 16, %o2 ! decrease count by 16 + ldx [%o1-8], %o3 ! + add %o0, 16, %o0 ! increase dst ptr by 16 + stx %o3, [%o0-8] +.medl15: + addcc %o2, 15, %o2 ! restore count + bz,pt %ncc, .medwexit ! exit if finished + nop + cmp %o2, 8 + blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left + nop + ldx [%o1], %o4 ! load 8 bytes + add %o1, 8, %o1 ! increase src ptr by 8 + stx %o4, [%o0] ! and store 8 bytes + subcc %o2, 8, %o2 ! decrease count by 8 + bz %ncc, .medwexit ! exit if finished + add %o0, 8, %o0 ! increase dst ptr by 8 + ba .medw7 + nop + + .align 16 + nop + nop + nop +.mediumsetup: + prefetch [%o1 + (2 * BLOCK_SIZE)], 21 +.mediumrejoin: + rd %fprs, %o4 ! check for unused FPU + + add %o1, 8, %o1 ! prepare to round SRC upward + + sethi %hi(0x1234567f), %o5 ! For GSR.MASK + or %o5, 0x67f, %o5 + + andcc %o4, FPRS_FEF, %o4 ! test FEF, fprs.du = fprs.dl = 0 + bz,a %ncc, 3f + wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 +3: + cmp %o2, MEDIUM_MAX + bmask %o5, %g0, %g0 + + ! Compute o5 (number of bytes that need copying using the main loop). + ! First, compute for the medium case. + ! Then, if large case, o5 is replaced by count for block alignment. + ! Be careful not to read past end of SRC + ! Currently, o2 is the actual count remaining + ! o3 is how much sooner we'll cross the alignment boundary + ! in SRC compared to in DST + ! + ! Examples: Let # denote bytes that should not be accessed + ! Let x denote a byte already copied to align DST + ! Let . and - denote bytes not yet copied + ! Let | denote double alignment boundaries + ! + ! DST: ######xx|........|--------|..###### o2 = 18 + ! o0 + ! + ! o3 = -3: SRC: ###xx...|.....---|-----..#|######## o5 = 8 + ! o1 + ! + ! o3 = 0: SRC: ######xx|........|--------|..###### o5 = 16-8 = 8 + ! o1 + ! + ! o3 = +1: SRC: #######x|x.......|.-------|-..##### o5 = 16-8 = 8 + ! o1 + + or %g0, -8, %o5 + alignaddr %o1, %g0, %o1 ! set GSR.ALIGN and align o1 + + movrlz %o3, %g0, %o5 ! subtract 8 from o2+o3 only if o3>=0 + add %o5, %o2, %o5 + add %o5, %o3, %o5 + + bleu %ncc, 4f + andn %o5, 7, %o5 ! 8 byte aligned count + neg %o0, %o5 ! 'large' case + and %o5, BLOCK_SIZE-1, %o5 ! bytes till DST block aligned +4: + brgez,a %o3, .beginmedloop + ldd [%o1-8], %d0 + + add %o1, %o3, %o1 ! back up o1 +5: + ldda [%o1]ASI_FL8_P, %d2 + inc %o1 + andcc %o1, 7, %g0 + bnz %ncc, 5b + bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2 + +.beginmedloop: + tst %o5 + bz %ncc, .endmedloop + sub %o2, %o5, %o2 ! update count for later + + ! Main loop to write out doubles. Note: o5 & 7 == 0 + + ldd [%o1], %d2 + subcc %o5, 8, %o5 ! update local count + bz,pn %ncc, 1f + add %o1, 8, %o1 ! update SRC + +.medloop: + faligndata %d0, %d2, %d4 + ldd [%o1], %d0 + subcc %o5, 8, %o5 ! update local count + add %o1, 16, %o1 ! update SRC + std %d4, [%o0] + bz,pn %ncc, 2f + faligndata %d2, %d0, %d6 + ldd [%o1 - 8], %d2 + subcc %o5, 8, %o5 ! update local count + std %d6, [%o0 + 8] + bnz,pt %ncc, .medloop + add %o0, 16, %o0 ! update DST + +1: + faligndata %d0, %d2, %d4 + fmovd %d2, %d0 + std %d4, [%o0] + ba .endmedloop + add %o0, 8, %o0 + +2: + std %d6, [%o0 + 8] + sub %o1, 8, %o1 + add %o0, 16, %o0 + + +.endmedloop: + ! 
Currently, o1 is pointing to the next double-aligned byte in SRC + ! The 8 bytes starting at [o1-8] are available in d0 + ! At least one, and possibly all, of these need to be written. + + cmp %o2, BLOCK_SIZE + bgu %ncc, .large ! otherwise, less than 16 bytes left + +#if 0 + + /* This code will use partial stores. */ + + mov %g0, %o5 + and %o3, 7, %o3 ! Number of bytes needed to completely + ! fill %d0 with good (unwritten) data. + + subcc %o2, 8, %o2 ! update count (maybe too much) + movl %ncc, %o2, %o5 + addcc %o3, %o5, %o5 ! extra bytes we can stuff into %d0 + sub %o3, %o5, %o3 ! update o3 (# bad bytes in %d0) + + bz %ncc, 2f + alignaddr %o3, %g0, %g0 ! set GSR.ALIGN + +1: + deccc %o5 + ldda [%o1]ASI_FL8_P, %d2 + inc %o1 + bgu %ncc, 1b + bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2 + +2: + not %o3 + faligndata %d0, %d0, %d0 ! shift bytes to the left + and %o3, 7, %o3 ! last byte to be stored in [%o0+%o3] + edge8n %g0, %o3, %o5 + stda %d0, [%o0]%o5, ASI_PST8_P + brlez %o2, .mediumexit + add %o0, %o3, %o0 ! update DST to last stored byte +3: + inc %o0 + deccc %o2 + ldub [%o1], %o3 + stb %o3, [%o0] + bgu %ncc, 3b + inc %o1 + +#else + + andcc %o3, 7, %o5 ! Number of bytes needed to completely + ! fill %d0 with good (unwritten) data. + bz %ncc, 2f + sub %o5, 8, %o3 ! -(number of good bytes in %d0) + cmp %o2, 8 + bl,a %ncc, 3f ! Not enough bytes to fill %d0 + add %o1, %o3, %o1 ! Back up %o1 + +1: + deccc %o5 + ldda [%o1]ASI_FL8_P, %d2 + inc %o1 + bgu %ncc, 1b + bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2 + +2: + subcc %o2, 8, %o2 + std %d0, [%o0] + bz %ncc, .mediumexit + add %o0, 8, %o0 +3: + ldub [%o1], %o3 + deccc %o2 + inc %o1 + stb %o3, [%o0] + bgu %ncc, 3b + inc %o0 +#endif + +.mediumexit: + wr %o4, %g0, %fprs ! fprs = o4 restore fprs + retl + mov %g1, %o0 + + + .align ICACHE_LINE_SIZE +.large: + ! The following test for BSTORE_SIZE is used to decide whether + ! to store data with a block store or with individual stores. + ! The block store wins when the amount of data is so large + ! that it causes other application data to be moved out + ! of the L1 or L2 cache. + ! On a Panther, block store can lose more often because block + ! store forces the stored data to be removed from the L3 cache. + ! + sethi %hi(BSTORE_SIZE),%o5 + or %o5,%lo(BSTORE_SIZE),%o5 + cmp %o2, %o5 + bgu %ncc, .xlarge + + ! %o0 I/O DST is 64-byte aligned + ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN) + ! %d0 I/O already loaded with SRC data from [%o1-8] + ! %o2 I/O count (number of bytes that need to be written) + ! %o3 I Not written. If zero, then SRC is double aligned. + ! %o4 I Not written. Holds fprs. + ! %o5 O The number of doubles that remain to be written. + + ! Load the rest of the current block + ! Recall that %o1 is further into SRC than %o0 is into DST + + prefetch [%o0 + (0 * BLOCK_SIZE)], 22 + prefetch [%o0 + (1 * BLOCK_SIZE)], 22 + prefetch [%o0 + (2 * BLOCK_SIZE)], 22 + ldd [%o1], %f2 + prefetch [%o1 + (3 * BLOCK_SIZE)], 21 + ldd [%o1 + 0x8], %f4 + faligndata %f0, %f2, %f32 + ldd [%o1 + 0x10], %f6 + faligndata %f2, %f4, %f34 + ldd [%o1 + 0x18], %f8 + faligndata %f4, %f6, %f36 + ldd [%o1 + 0x20], %f10 + or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8 + prefetch [%o1 + (4 * BLOCK_SIZE)], 21 + faligndata %f6, %f8, %f38 + ldd [%o1 + 0x28], %f12 + movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later) + faligndata %f8, %f10, %f40 + ldd [%o1 + 0x30], %f14 + faligndata %f10, %f12, %f42 + ldd [%o1 + 0x38], %f0 + sub %o2, BLOCK_SIZE, %o2 ! 
update count + prefetch [%o1 + (5 * BLOCK_SIZE)], 21 + add %o1, BLOCK_SIZE, %o1 ! update SRC + + ! Main loop. Write previous block. Load rest of current block. + ! Some bytes will be loaded that won't yet be written. +1: + ldd [%o1], %f2 + faligndata %f12, %f14, %f44 + ldd [%o1 + 0x8], %f4 + faligndata %f14, %f0, %f46 + std %f32, [%o0] + std %f34, [%o0+8] + std %f36, [%o0+16] + std %f38, [%o0+24] + std %f40, [%o0+32] + std %f42, [%o0+40] + std %f44, [%o0+48] + std %f46, [%o0+56] + sub %o2, BLOCK_SIZE, %o2 ! update count + prefetch [%o0 + (6 * BLOCK_SIZE)], 22 + prefetch [%o0 + (3 * BLOCK_SIZE)], 22 + add %o0, BLOCK_SIZE, %o0 ! update DST + ldd [%o1 + 0x10], %f6 + faligndata %f0, %f2, %f32 + ldd [%o1 + 0x18], %f8 + faligndata %f2, %f4, %f34 + ldd [%o1 + 0x20], %f10 + faligndata %f4, %f6, %f36 + ldd [%o1 + 0x28], %f12 + faligndata %f6, %f8, %f38 + ldd [%o1 + 0x30], %f14 + faligndata %f8, %f10, %f40 + ldd [%o1 + 0x38], %f0 + faligndata %f10, %f12, %f42 + cmp %o2, BLOCK_SIZE + 8 + prefetch [%o1 + (5 * BLOCK_SIZE)], 21 + bgu,pt %ncc, 1b + add %o1, BLOCK_SIZE, %o1 ! update SRC + faligndata %f12, %f14, %f44 + faligndata %f14, %f0, %f46 + stda %f32, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache + cmp %o2, BLOCK_SIZE + bne %ncc, 2f ! exactly 1 block remaining? + add %o0, BLOCK_SIZE, %o0 ! update DST + brz,a %o3, 3f ! is SRC double aligned? + ldd [%o1], %f2 + +2: + add %o5, %o2, %o5 ! %o5 was already set to 0 or -8 + add %o5, %o3, %o5 + + membar #StoreLoad|#StoreStore + + ba .beginmedloop + andn %o5, 7, %o5 ! 8 byte aligned count + + + ! This is when there is exactly 1 block remaining and SRC is aligned +3: + ldd [%o1 + 0x8], %f4 + ldd [%o1 + 0x10], %f6 + fsrc1 %f0, %f32 + ldd [%o1 + 0x18], %f8 + fsrc1 %f2, %f34 + ldd [%o1 + 0x20], %f10 + fsrc1 %f4, %f36 + ldd [%o1 + 0x28], %f12 + fsrc1 %f6, %f38 + ldd [%o1 + 0x30], %f14 + fsrc1 %f8, %f40 + fsrc1 %f10, %f42 + fsrc1 %f12, %f44 + fsrc1 %f14, %f46 + stda %f32, [%o0]ASI_BLK_P + membar #StoreLoad|#StoreStore + wr %o4, 0, %fprs + retl + mov %g1, %o0 + + + .align 16 + ! two nops here causes loop starting at 1f below to be + ! on a cache line boundary, improving performance + nop + nop +.xlarge: + ! %o0 I/O DST is 64-byte aligned + ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN) + ! %d0 I/O already loaded with SRC data from [%o1-8] + ! %o2 I/O count (number of bytes that need to be written) + ! %o3 I Not written. If zero, then SRC is double aligned. + ! %o4 I Not written. Holds fprs. + ! %o5 O The number of doubles that remain to be written. + + ! Load the rest of the current block + ! Recall that %o1 is further into SRC than %o0 is into DST + + ! prefetch [%o1 + (3 * BLOCK_SIZE)], 21 + ! executed in delay slot for branch to .xlarge + prefetch [%o1 + (4 * BLOCK_SIZE)], 21 + prefetch [%o1 + (5 * BLOCK_SIZE)], 21 + ldd [%o1], %f2 + prefetch [%o1 + (6 * BLOCK_SIZE)], 21 + ldd [%o1 + 0x8], %f4 + faligndata %f0, %f2, %f32 + ldd [%o1 + 0x10], %f6 + faligndata %f2, %f4, %f34 + ldd [%o1 + 0x18], %f8 + faligndata %f4, %f6, %f36 + ldd [%o1 + 0x20], %f10 + or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8 + faligndata %f6, %f8, %f38 + ldd [%o1 + 0x28], %f12 + movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later) + faligndata %f8, %f10, %f40 + ldd [%o1 + 0x30], %f14 + faligndata %f10, %f12, %f42 + ldd [%o1 + 0x38], %f0 + sub %o2, BLOCK_SIZE, %o2 ! update count + prefetch [%o1 + (7 * BLOCK_SIZE)], 21 + add %o1, BLOCK_SIZE, %o1 ! update SRC + + ! This point is 32-byte aligned since 24 instructions appear since + ! the previous alignment directive. + + + ! Main loop. 
Write previous block. Load rest of current block. + ! Some bytes will be loaded that won't yet be written. +1: + ldd [%o1], %f2 + faligndata %f12, %f14, %f44 + ldd [%o1 + 0x8], %f4 + faligndata %f14, %f0, %f46 + stda %f32, [%o0]ASI_BLK_P + sub %o2, BLOCK_SIZE, %o2 ! update count + ldd [%o1 + 0x10], %f6 + faligndata %f0, %f2, %f32 + ldd [%o1 + 0x18], %f8 + faligndata %f2, %f4, %f34 + ldd [%o1 + 0x20], %f10 + faligndata %f4, %f6, %f36 + ldd [%o1 + 0x28], %f12 + faligndata %f6, %f8, %f38 + ldd [%o1 + 0x30], %f14 + faligndata %f8, %f10, %f40 + ldd [%o1 + 0x38], %f0 + faligndata %f10, %f12, %f42 + ! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K + prefetch [%o1 + (8 * BLOCK_SIZE) + 8], 21 + add %o0, BLOCK_SIZE, %o0 ! update DST + cmp %o2, BLOCK_SIZE + 8 + ! second prefetch important to correct for occasional dropped + ! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K + ! strong prefetch prevents drops on Panther, but Jaguar and earlier + ! US-III models treat strong prefetches as weak prefetches + ! to avoid regressions on customer hardware, we retain the prefetch + prefetch [%o1 + (5 * BLOCK_SIZE)], 21 + bgu,pt %ncc, 1b + add %o1, BLOCK_SIZE, %o1 ! update SRC + + faligndata %f12, %f14, %f44 + faligndata %f14, %f0, %f46 + stda %f32, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache + cmp %o2, BLOCK_SIZE + bne %ncc, 2f ! exactly 1 block remaining? + add %o0, BLOCK_SIZE, %o0 ! update DST + brz,a %o3, 3f ! is SRC double aligned? + ldd [%o1], %f2 + +2: + add %o5, %o2, %o5 ! %o5 was already set to 0 or -8 + add %o5, %o3, %o5 + + membar #StoreLoad|#StoreStore + + ba .beginmedloop + andn %o5, 7, %o5 ! 8 byte aligned count + + + ! This is when there is exactly 1 block remaining and SRC is aligned +3: + ldd [%o1 + 0x8], %f4 + ldd [%o1 + 0x10], %f6 + fsrc1 %f0, %f32 + ldd [%o1 + 0x18], %f8 + fsrc1 %f2, %f34 + ldd [%o1 + 0x20], %f10 + fsrc1 %f4, %f36 + ldd [%o1 + 0x28], %f12 + fsrc1 %f6, %f38 + ldd [%o1 + 0x30], %f14 + fsrc1 %f8, %f40 + fsrc1 %f10, %f42 + fsrc1 %f12, %f44 + fsrc1 %f14, %f46 + stda %f32, [%o0]ASI_BLK_P + membar #StoreLoad|#StoreStore + wr %o4, 0, %fprs + retl + mov %g1, %o0 + + SET_SIZE(memcpy)
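
As a reading aid (not part of the patch): the entry sequence of memmove above (cmp %o1, %o0 ... sub %o0, %o1, %o4 ... bleu %ncc, .forcpy) implements the usual overlap test, copying forward unless the destination starts inside the source region. A minimal C sketch under that reading; the name my_memmove is illustrative only:

	#include <stddef.h>

	void *
	my_memmove(void *s1, const void *s2, size_t n)
	{
		char *dst = s1;
		const char *src = s2;

		if (src >= dst || (size_t)(dst - src) >= n) {
			/* no harmful overlap: forward copy (.forcpy) */
			for (size_t i = 0; i < n; i++)
				dst[i] = src[i];
		} else {
			/* dst lands inside src: copy backward (.ovbc) */
			while (n-- != 0)
				dst[n] = src[n];
		}
		return (s1);
	}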
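The misaligned paths (.dbbck, .mediumsetup, .large, .xlarge) all lean on alignaddr/faligndata: SRC is rounded down to an 8-byte boundary, and each aligned 8-byte store is assembled from two adjacent aligned loads. A rough C model of one faligndata step, assuming big-endian byte order as on SPARC and a nonzero misalignment (when src and dst are mutually aligned the code takes the integer paths instead):

	#include <stdint.h>

	/*
	 * Hypothetical model, not the hardware definition: prev and next
	 * are two consecutive aligned 8-byte loads (like %d0 and %d2);
	 * shift is (src & 7), the value alignaddr latches into GSR.ALIGN.
	 * Valid for 0 < shift < 8; shift == 0 would just return prev.
	 */
	static uint64_t
	falign_model(uint64_t prev, uint64_t next, unsigned shift)
	{
		/* big-endian: low bytes of prev, then high bytes of next */
		return ((prev << (8 * shift)) | (next >> (8 * (8 - shift))));
	}

Note the asm is careful never to load a double that lies wholly past the end of SRC (see the "Be careful not to read past end of SRC" comment); a C model this simple would over-read at the tail.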
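Finally, the length constants defined at the top of the file (SHORTCOPY, SMALL_MAX, MED_WMAX/MED_MAX, BSTORE_SIZE) drive a strategy dispatch that is roughly as follows. This is a simplification of the real branch structure, and the enum and function names are illustrative only:

	#include <stddef.h>

	enum strategy {
		BYTE_COPY,	/* .smallleft: 0-3 bytes */
		WORD_COPY,	/* .smallword paths: up to SMALL_MAX (39) */
		INT_UNROLLED,	/* .medw*/ /* and .medl*: mutually aligned */
		FP_ALIGN_COPY,	/* .medloop/.large: faligndata + std */
		FP_BLOCK_STORE	/* .xlarge: stda ASI_BLK_P, bypasses cache */
	};

	static enum strategy
	pick_strategy(size_t n, int mutually_aligned)
	{
		if (n <= 3)				/* SHORTCOPY */
			return (BYTE_COPY);
		if (n <= 39)				/* SMALL_MAX */
			return (WORD_COPY);
		if (mutually_aligned && n <= 256)	/* MED_WMAX/MED_MAX */
			return (INT_UNROLLED);
		if (n <= 256)				/* BSTORE_SIZE default */
			return (FP_ALIGN_COPY);
		return (FP_BLOCK_STORE);
	}

Per the comment at .large, the BSTORE_SIZE cutover exists because a block store only wins once the data is large enough to evict other useful lines from the L1/L2 caches, and can lose on Panther, where it forces the stored data out of L3.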