path: root/usr/src/lib/libc
author    Jason Beloro <Jason.Beloro@Sun.COM>  2009-08-06 17:39:39 -0700
committer Jason Beloro <Jason.Beloro@Sun.COM>  2009-08-06 17:39:39 -0700
commit    9d0d62ad2e60e8f742a2e723d06e88352ee6a1f3 (patch)
tree      016e2a6b2f674016c46785258d0ff85e6b1bce09 /usr/src/lib/libc
parent    32a6953793c636df949ca1ae3555438159bda3f6 (diff)
download  illumos-joyent-9d0d62ad2e60e8f742a2e723d06e88352ee6a1f3.tar.gz
6858457 Remove Solaris support for UltraSPARC-AT10 processor
Diffstat (limited to 'usr/src/lib/libc')
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/common/gen/memcpy.s | 1704
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/common/gen/memset.s |  767
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/common/gen/misc.s   |   33
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/common/gen/strcpy.s |  340
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/common/gen/strlen.s |  127
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/sparc/Makefile      |    5
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile    |    5
7 files changed, 4 insertions, 2977 deletions
diff --git a/usr/src/lib/libc/sparc_hwcap1/common/gen/memcpy.s b/usr/src/lib/libc/sparc_hwcap1/common/gen/memcpy.s
deleted file mode 100644
index 8fdb95268f..0000000000
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/memcpy.s
+++ /dev/null
@@ -1,1704 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
- .file "memcpy.s"
-
-/*
- * memcpy(s1, s2, len)
- *
- * Copy s2 to s1, always copying n bytes.
- * Note: this C code does not work for overlapped copies;
- * memmove() and bcopy() do.
- *
- * Added entry __align_cpy_1 is generally for use of the compilers.
- *
- * Fast assembler language version of the following C-program for memcpy
- * which represents the `standard' for the C-library.
- *
- * void *
- * memcpy(void *s, const void *s0, size_t n)
- * {
- * if (n != 0) {
- * char *s1 = s;
- * const char *s2 = s0;
- * do {
- * *s1++ = *s2++;
- * } while (--n != 0);
- * }
- * return (s);
- * }
- */
-
-#include <sys/asm_linkage.h>
-#include <sys/sun4asi.h>
-#include <sys/trap.h>
-
-#ifdef __sparcv9
-#define SAVESIZE (8 * 1)
-#define STACK_OFFSET (STACK_BIAS + MINFRAME)
-#else
-#define SAVESIZE (8 * 3)
-#define STACK_OFFSET (STACK_BIAS + MINFRAME + 4)
-#endif
-
-#define scratch_offset 0
-#define g4_offset 8
-#define g5_offset 16
-
-#define ICACHE_LINE_SIZE 64
-#define BLOCK_SIZE 64
-#define FPRS_FEF 0x4
-#define PF_FAR 2048
-#define PF_NEAR 1024
-
-#define SHORTCOPY 3
-#define SMALL_MAX 39
-#define MEDIUM_MAX 255
-#define MED_WMAX 256 /* max copy for medium word-aligned case */
-#define MED_MAX 256 /* max copy for medium longword-aligned case */
-
-#ifndef BSTORE_SIZE
-#define BSTORE_SIZE 256 /* min copy size for block store */
-#endif
-
-/*
- * The LDDs will use the below ASI for performance
- * This ASI minimizes cache pollution.
- */
-#define ASI_CACHE_SPARING 0xf4
-#define ASI_CACHE_SPARING_PRIMARY 0xf4
-
- ANSI_PRAGMA_WEAK(memmove,function)
- ANSI_PRAGMA_WEAK(memcpy,function)
-
- ENTRY(memmove)
- cmp %o1, %o0 ! if from address is >= to, use forward copy
- bgeu %ncc, .forcpy ! else check whether the regions overlap
- sub %o0, %o1, %o4 ! get difference of the two addresses
- cmp %o2, %o4 ! compare size against the difference
- bleu %ncc, .forcpy ! if size fits in the gap, forward copy is safe
- nop
-
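
As a reading aid, the forward/backward dispatch above is equivalent to the
following C sketch (the byte loops are stand-ins for the .forcpy and .ovbc
paths, which the assembly of course implements with much wider moves):

    #include <stddef.h>
    #include <stdint.h>

    void *memmove_sketch(void *dst, const void *src, size_t len)
    {
        char *d = dst;
        const char *s = src;

        /* forward copy is safe when src >= dst, or when dst is at
           least len bytes past src (the regions cannot collide) */
        if ((uintptr_t)s >= (uintptr_t)d ||
            len <= (uintptr_t)d - (uintptr_t)s) {
            while (len--)
                *d++ = *s++;            /* .forcpy direction */
        } else {
            d += len;
            s += len;
            while (len--)
                *--d = *--s;            /* .ovbc backward direction */
        }
        return dst;
    }
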
- !
- ! an overlapped copy that must be done "backwards"
- !
-.ovbc:
- mov %o0, %g1 ! save dest address for return val
- add %o1, %o2, %o1 ! get to end of source space
- add %o0, %o2, %o0 ! get to end of destination space
-
- cmp %o2, 24
- bgeu,pn %ncc, .dbalign
- nop
- cmp %o2, 4
- blt,pn %ncc, .byte
- sub %o2, 3, %o2
-.byte4loop:
- ldub [%o1-1], %o3 ! load last byte
- stb %o3, [%o0-1] ! store last byte
- sub %o1, 4, %o1
- ldub [%o1+2], %o3 ! load 2nd from last byte
- stb %o3, [%o0-2] ! store 2nd from last byte
- sub %o0, 4, %o0
- ldub [%o1+1], %o3 ! load 3rd from last byte
- stb %o3, [%o0+1] ! store 3rd from last byte
- subcc %o2, 4, %o2
- ldub [%o1], %o3 ! load 4th from last byte
- bgu,pt %ncc, .byte4loop
- stb %o3, [%o0] ! store 4th from last byte
-.byte:
- addcc %o2, 3, %o2
- bz,pt %ncc, .exit
-.byteloop:
- dec %o1 ! decrement src address
- ldub [%o1], %o3 ! read a byte
- dec %o0 ! decrement dst address
- deccc %o2 ! decrement count
- bgu,pt %ncc, .byteloop ! loop until done
- stb %o3, [%o0] ! write byte
-.exit:
- retl
- mov %g1, %o0
-
- .align 16
-.dbalign:
- andcc %o0, 7, %o5 ! bytes till DST 8 byte aligned
- bz,pt %ncc, .dbmed
- sub %o2, %o5, %o2 ! update count
-.dbalign1:
- dec %o1 ! decrement src address
- ldub [%o1], %o3 ! read a byte
- dec %o0 ! decrement dst address
- deccc %o5 ! decrement count
- bgu,pt %ncc, .dbalign1 ! loop until done
- stb %o3, [%o0] ! store a byte
-
-! check for src long word alignment
-.dbmed:
- mov %asi, %g5 ! save curr %asi
- wr %g0, ASI_CACHE_SPARING, %asi
- andcc %o1, 7, %g0 ! chk src long word alignment
- bnz,pn %ncc, .dbbck
- nop
-!
-! Following code is for overlapping copies where src and dest
-! are long word aligned
-!
- cmp %o2, 4095
- blt,pn %ncc, .dbmedl32enter ! go to no prefetch code
- nop
- prefetch [%o1 - (1 * BLOCK_SIZE)], #n_reads
- sub %o2, 63, %o2 ! adjust length to allow cc test
- ! for end of loop
- prefetch [%o1 - (2 * BLOCK_SIZE)], #n_reads
- prefetch [%o1 - (3 * BLOCK_SIZE)], #n_reads
- prefetch [%o1 - (4 * BLOCK_SIZE)], #n_reads
-.dbmedl64:
- prefetch [%o1 - (5 * BLOCK_SIZE)], #n_reads
- ldxa [%o1-8]%asi, %o3 ! load
- subcc %o2, 64, %o2 ! decrement length count
- stx %o3, [%o0-8] ! and store
- ldxa [%o1-16]%asi, %o3 ! a block of 64 bytes
- sub %o1, 64, %o1 ! decrease src ptr by 64
- stx %o3, [%o0-16]
- sub %o0, 64, %o0 ! decrease dst ptr by 64
- ldxa [%o1+40]%asi, %o3
- ldxa [%o1+32]%asi, %o4
- ldxa [%o1+24]%asi, %o5
- stx %o3, [%o0+40]
- stx %o4, [%o0+32]
- stx %o5, [%o0+24]
- ldxa [%o1+16]%asi, %o3
- ldxa [%o1+8]%asi, %o4
- stx %o3, [%o0+16]
- stx %o4, [%o0+8]
- ldxa [%o1]%asi, %o5
- bgu,pt %ncc, .dbmedl64 ! repeat if at least 64 bytes left
- stx %o5, [%o0]
- add %o2, 63, %o2 ! restore offset adjustment
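
The -63/+63 bias around the loop above lets the subcc that decrements the
count double as the "at least 64 bytes remain" test, saving a separate
compare. A minimal C sketch of what the biased count computes
(copy_block64 is a hypothetical stand-in for one unrolled 64-byte
backward chunk):

    size_t n = len;
    while (n >= 64) {         /* folded into subcc %o2, 64 / bgu */
        s -= 64;
        d -= 64;
        copy_block64(d, s);   /* eight 8-byte loads and stores   */
        n -= 64;
    }
    /* 0..63 bytes remain in n for the fixup code that follows */
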
-.dbmedl32enter:
- subcc %o2, 31, %o2 ! adjust length to allow cc test
- ! for end of loop
- ble,pt %ncc, .dbmedl31 ! skip big loop if less than 32
- nop
-.dbmedl32:
- ldx [%o1-8], %o4 ! load
- subcc %o2, 32, %o2 ! decrement length count
- stx %o4, [%o0-8] ! and store
- ldx [%o1-16], %o3 ! a block of 32 bytes
- sub %o1, 32, %o1 ! decrease src ptr by 32
- stx %o3, [%o0-16]
- ldx [%o1+8], %o4
- sub %o0, 32, %o0 ! decrease dst ptr by 32
- stx %o4, [%o0+8]
- ldx [%o1], %o3
- bgu,pt %ncc, .dbmedl32 ! repeat if at least 32 bytes left
- stx %o3, [%o0]
-.dbmedl31:
- addcc %o2, 16, %o2 ! adjust remaining count
- ble,pt %ncc, .dbmedl15 ! skip if 15 or fewer bytes left
- nop !
- ldx [%o1-8], %o4 ! load and store 16 bytes
- sub %o1, 16, %o1 ! decrease src ptr by 16
- stx %o4, [%o0-8] !
- sub %o2, 16, %o2 ! decrease count by 16
- ldx [%o1], %o3 !
- sub %o0, 16, %o0 ! decrease dst ptr by 16
- stx %o3, [%o0]
-.dbmedl15:
- addcc %o2, 15, %o2 ! restore count
- bz,pt %ncc, .dbexit ! exit if finished
- nop
- cmp %o2, 8
- blt,pt %ncc, .dbremain ! skip if 7 or fewer bytes left
- nop
- ldx [%o1-8], %o4 ! load 8 bytes
- sub %o1, 8, %o1 ! decrease src ptr by 8
- stx %o4, [%o0-8] ! and store 8 bytes
- subcc %o2, 8, %o2 ! decrease count by 8
- bnz %ncc, .dbremain ! exit if finished
- sub %o0, 8, %o0 ! decrease dst ptr by 8
- mov %g5, %asi ! restore %asi
- retl
- mov %g1, %o0
-
-!
-! Following code is for overlapping copies where src and dest
-! are not long word aligned
-!
- .align 16
-.dbbck:
- rd %fprs, %o3 ! o3 = fprs
-
- ! if fprs.fef == 0, set it. Checking it requires 2 instructions,
- ! so set it anyway, without checking.
- wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
-
- alignaddr %o1, %g0, %o5 ! align src
- ldda [%o5]%asi, %d0 ! get first 8 byte block
- andn %o2, 7, %o4 ! prepare src ptr for finishup code
- cmp %o2, 32
- blt,pn %ncc, .dbmv8
- sub %o1, %o4, %o1 !
- cmp %o2, 4095 ! check for short memmoves
- blt,pn %ncc, .dbmv32enter ! go to no prefetch code
-.dbmv64:
- ldda [%o5-8]%asi, %d2 ! load 8 bytes
- ldda [%o5-16]%asi, %d4 ! load 8 bytes
- sub %o5, 64, %o5 !
- ldda [%o5+40]%asi, %d6 ! load 8 bytes
- sub %o0, 64, %o0 !
- ldda [%o5+32]%asi, %d8 ! load 8 bytes
- sub %o2, 64, %o2 ! 64 less bytes to copy
- ldda [%o5+24]%asi, %d18 ! load 8 bytes
- cmp %o2, 64 ! do we have < 64 bytes remaining
- ldda [%o5+16]%asi, %d28 ! load 8 bytes
- ldda [%o5+8]%asi, %d30 ! load 8 bytes
- prefetch [%o5 - (5 * BLOCK_SIZE)], #n_reads
- faligndata %d2, %d0, %d10 ! extract 8 bytes out
- ldda [%o5]%asi, %d0 ! load 8 bytes
- std %d10, [%o0+56] ! store the current 8 bytes
- faligndata %d4, %d2, %d12 ! extract 8 bytes out
- std %d12, [%o0+48] ! store the current 8 bytes
- faligndata %d6, %d4, %d14 ! extract 8 bytes out
- std %d14, [%o0+40] ! store the current 8 bytes
- faligndata %d8, %d6, %d16 ! extract 8 bytes out
- std %d16, [%o0+32] ! store the current 8 bytes
- faligndata %d18, %d8, %d20 ! extract 8 bytes out
- std %d20, [%o0+24] ! store the current 8 bytes
- faligndata %d28, %d18, %d22 ! extract 8 bytes out
- std %d22, [%o0+16] ! store the current 8 bytes
- faligndata %d30, %d28, %d24 ! extract 8 bytes out
- std %d24, [%o0+8] ! store the current 8 bytes
- faligndata %d0, %d30, %d26 ! extract 8 bytes out
- bgeu,pt %ncc, .dbmv64
- std %d26, [%o0] ! store the current 8 bytes
-
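
The faligndata sequence above implements the misaligned copy in software:
GSR.ALIGN holds the byte offset set earlier by alignaddr, and each
faligndata extracts 8 output bytes from a 16-byte window of two source
doublewords. A hedged C model of the instruction on big-endian SPARC
(off is the GSR.ALIGN value, 0..7):

    #include <stdint.h>

    static uint64_t faligndata_model(uint64_t hi, uint64_t lo, unsigned off)
    {
        if (off == 0)
            return hi;                       /* avoid shifting by 64 */
        return (hi << (8 * off)) | (lo >> (8 * (8 - off)));
    }
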
- cmp %o2, 32
- blt,pn %ncc, .dbmvx
- nop
-.dbmv32:
- ldda [%o5-8]%asi, %d2 ! load 8 bytes
-.dbmv32enter:
- ldda [%o5-16]%asi, %d4 ! load 8 bytes
- sub %o5, 32, %o5 !
- ldda [%o5+8]%asi, %d6 ! load 8 bytes
- sub %o0, 32, %o0 !
- faligndata %d2, %d0, %d10 ! extract 8 bytes out
- ldda [%o5]%asi, %d0 ! load 8 bytes
- sub %o2,32, %o2 ! 32 less bytes to copy
- std %d10, [%o0+24] ! store the current 8 bytes
- cmp %o2, 32 ! do we have < 32 bytes remaining
- faligndata %d4, %d2, %d12 ! extract 8 bytes out
- std %d12, [%o0+16] ! store the current 8 bytes
- faligndata %d6, %d4, %d14 ! extract 8 bytes out
- std %d14, [%o0+8] ! store the current 8 bytes
- faligndata %d0, %d6, %d16 ! extract 8 bytes out
- bgeu,pt %ncc, .dbmv32
- std %d16, [%o0] ! store the current 8 bytes
-.dbmvx:
- cmp %o2, 8 ! do we have < 8 bytes remaining
- blt,pt %ncc, .dbmvfinish ! if yes, skip to finish up code
- nop
-.dbmv8:
- ldda [%o5-8]%asi, %d2
- sub %o0, 8, %o0 ! since we are at the end
- ! when we first enter the loop
- sub %o2, 8, %o2 ! 8 less bytes to copy
- sub %o5, 8, %o5
- cmp %o2, 8 ! do we have < 8 bytes remaining
- faligndata %d2, %d0, %d8 ! extract 8 bytes out
- std %d8, [%o0] ! store the current 8 bytes
- bgeu,pt %ncc, .dbmv8
- fmovd %d2, %d0
-.dbmvfinish:
- and %o3, 0x4, %o3 ! fprs.du = fprs.dl = 0
- tst %o2
- bz,pt %ncc, .dbexit
- wr %o3, %g0, %fprs ! fprs = o3 restore fprs
-
-.dbremain:
- cmp %o2, 4
- blt,pn %ncc, .dbbyte
- nop
- ldub [%o1-1], %o3 ! load last byte
- stb %o3, [%o0-1] ! store last byte
- sub %o1, 4, %o1
- ldub [%o1+2], %o3 ! load 2nd from last byte
- stb %o3, [%o0-2] ! store 2nd from last byte
- sub %o0, 4, %o0
- ldub [%o1+1], %o3 ! load 3rd from last byte
- stb %o3, [%o0+1] ! store 3rd from last byte
- subcc %o2, 4, %o2
- ldub [%o1], %o3 ! load 4th from last byte
- stb %o3, [%o0] ! store 4th from last byte
- bz,pt %ncc, .dbexit
-.dbbyte:
- dec %o1 ! decrement src address
- ldub [%o1], %o3 ! read a byte
- dec %o0 ! decrement dst address
- deccc %o2 ! decrement count
- bgu,pt %ncc, .dbbyte ! loop until done
- stb %o3, [%o0] ! write byte
-.dbexit:
- mov %g5, %asi ! restore %asi
- retl
- mov %g1, %o0
- SET_SIZE(memmove)
-
- .align ICACHE_LINE_SIZE
- ENTRY(memcpy)
- ENTRY(__align_cpy_1)
- ! adjust instruction alignment
- nop ! Do not remove, these nops affect
- nop ! icache alignment and performance
-.forcpy:
- cmp %o2, SMALL_MAX ! check for not small case
- bgu,pn %ncc, .medium ! go to larger cases
- mov %o0, %g1 ! save %o0
- cmp %o2, SHORTCOPY ! check for really short case
- ble,pt %ncc, .smallleft !
- or %o0, %o1, %o3 ! prepare alignment check
- andcc %o3, 0x3, %g0 ! test for alignment
- bz,pt %ncc, .smallword ! branch to word aligned case
- sub %o2, 3, %o2 ! adjust count to allow cc zero test
-.smallnotalign4:
- ldub [%o1], %o3 ! read byte
- subcc %o2, 4, %o2 ! reduce count by 4
- stb %o3, [%o0] ! write byte
- ldub [%o1+1], %o3 ! repeat for a total of 4 bytes
- add %o1, 4, %o1 ! advance SRC by 4
- stb %o3, [%o0+1]
- ldub [%o1-2], %o3
- add %o0, 4, %o0 ! advance DST by 4
- stb %o3, [%o0-2]
- ldub [%o1-1], %o3
- bgu,pt %ncc, .smallnotalign4 ! loop until 3 or fewer bytes remain
- stb %o3, [%o0-1]
- add %o2, 3, %o2 ! restore count
-.smallleft:
- tst %o2
- bz,pt %ncc, .smallexit
- nop
-.smallleft3: ! 1, 2, or 3 bytes remain
- ldub [%o1], %o3 ! load one byte
- deccc %o2 ! reduce count for cc test
- bz,pt %ncc, .smallexit
- stb %o3, [%o0] ! store one byte
- ldub [%o1+1], %o3 ! load second byte
- deccc %o2
- bz,pt %ncc, .smallexit
- stb %o3, [%o0+1] ! store second byte
- ldub [%o1+2], %o3 ! load third byte
- stb %o3, [%o0+2] ! store third byte
- retl
- mov %g1, %o0 ! restore %o0
-
- .align 16
- nop ! affects loop icache alignment
-.smallwords:
- lduw [%o1], %o3 ! read word
-.smallwordx:
- subcc %o2, 8, %o2 ! update count
- stw %o3, [%o0] ! write word
- add %o1, 8, %o1 ! update SRC
- lduw [%o1-4], %o3 ! read word
- add %o0, 8, %o0 ! update DST
- bgu,pt %ncc, .smallwords ! loop until done
- stw %o3, [%o0-4] ! write word
- addcc %o2, 7, %o2 ! restore count
- bz,pt %ncc, .smallexit ! check for completion
- nop
- cmp %o2, 4 ! check for 4 or more bytes left
- blt .smallleft3 ! if not, go to finish up
- nop
- lduw [%o1], %o3
- add %o1, 4, %o1
- subcc %o2, 4, %o2
- stw %o3, [%o0]
- add %o0, 4, %o0
- bnz,pt %ncc, .smallleft3
- nop
- retl
- mov %g1, %o0 ! restore %o0
-
-.smallword:
- subcc %o2, 4, %o2 ! update count
- bgu,pt %ncc, .smallwordx
- lduw [%o1], %o3 ! read word
- addcc %o2, 3, %o2 ! restore count
- bz,pt %ncc, .smallexit
- stw %o3, [%o0] ! write word
- deccc %o2 ! reduce count for cc test
- ldub [%o1+4], %o3 ! load one byte
- bz,pt %ncc, .smallexit
- stb %o3, [%o0+4] ! store one byte
- ldub [%o1+5], %o3 ! load second byte
- deccc %o2
- bz,pt %ncc, .smallexit
- stb %o3, [%o0+5] ! store second byte
- ldub [%o1+6], %o3 ! load third byte
- stb %o3, [%o0+6] ! store third byte
-.smallexit:
- retl
- mov %g1, %o0 ! restore %o0
- .align 16
-.medium:
- neg %o0, %o5
- neg %o1, %o3
- andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned
- and %o3, 7, %o3 ! bytes till SRC 8 byte aligned
- cmp %o5, %o3
- bne %ncc, continue
- sub %o5, %o3, %o3 ! -(bytes till SRC aligned after DST aligned)
- ! o3={-7, -6, ... 7} o3>0 => SRC overaligned
- ! src and dst are aligned.
- mov %o3, %g5 ! save %o3
- andcc %o1, 7, %o3 ! is src buf aligned on an 8 byte boundary
- brz,pt %o3, src_dst_aligned_on_8
- mov %o3, %o5
- mov 8, %o4
- sub %o4, %o3, %o3
- cmp %o3, %o2
- bg,a,pn %ncc, 1f
- mov %o2, %o3
-1:
- ! %o3 has the bytes to be written in partial store.
- sub %o2, %o3, %o2
- prefetch [%o1],2
-
-7:
- deccc %o3 ! byte clearing loop
- ldub [%o1], %o4 ! load one byte
- stb %o4, [%o0]
- inc %o1 ! increment src
- bgu,pt %ncc, 7b
- inc %o0 ! increment dst
-
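
A C sketch of the head-alignment step just performed, assuming s, d, and
len mirror %o1, %o0, and %o2:

    size_t head = (8 - ((uintptr_t)s & 7)) & 7;  /* bytes to 8-byte align */
    if (head > len)
        head = len;                  /* clamp: don't overrun short copies */
    len -= head;
    while (head--)
        *d++ = *s++;                 /* the byte loop at 7: above */
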
- mov %g5, %o3 ! restore %o3
-src_dst_aligned_on_8:
- ! check if we are copying 1k or more bytes
- cmp %o2, 511
- bgu,pt %ncc, copying_ge_512
- nop
- ba .medlword
- nop
-
-continue:
- andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned
- bz %ncc, 2f
- nop
-
- sub %o2, %o5, %o2 ! update count
-
-1:
- ldub [%o1], %o4
- deccc %o5
- inc %o1
- stb %o4, [%o0]
- bgu,pt %ncc, 1b
- inc %o0
-
- ! Now DST is 8-byte aligned. o0, o1, o2 are current.
-
-2:
- andcc %o1, 0x3, %g0 ! test alignment
- bnz,pt %ncc, .mediumsetup ! branch to skip aligned cases
- ! if src, dst not aligned
- prefetch [%o1 + (1 * BLOCK_SIZE)], #n_reads
-
-/*
- * Handle all cases where src and dest are aligned on word
- * or long word boundaries. Use unrolled loops for better
- * performance. This option wins over standard large data
- * move when source and destination is in cache for medium
- * to short data moves.
- */
- andcc %o1, 0x7, %g0 ! test word alignment
- bz,pt %ncc, src_dst_lword_aligned ! branch to long word aligned case
- prefetch [%o1 + (2 * BLOCK_SIZE)], #n_reads
- cmp %o2, MED_WMAX ! limit to store buffer size
- bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop
- nop
- subcc %o2, 15, %o2 ! adjust length to allow cc test
- ! for end of loop
- ble,pt %ncc, .medw15 ! skip big loop if less than 16
- prefetch [%o1 + (3 * BLOCK_SIZE)], #n_reads
-/*
- * no need to put prefetch in loop as prefetches have
- * already been issued for maximum loop size
- */
-.medw16:
- ld [%o1], %o4 ! load
- subcc %o2, 16, %o2 ! decrement length count
- stw %o4, [%o0] ! and store
- ld [%o1+4], %o3 ! a block of 16 bytes
- add %o1, 16, %o1 ! increase src ptr by 16
- stw %o3, [%o0+4]
- ld [%o1-8], %o4
- add %o0, 16, %o0 ! increase dst ptr by 16
- stw %o4, [%o0-8]
- ld [%o1-4], %o3
- bgu,pt %ncc, .medw16 ! repeat if at least 16 bytes left
- stw %o3, [%o0-4]
-.medw15:
- addcc %o2, 15, %o2 ! restore count
- bz,pt %ncc, .medwexit ! exit if finished
- nop
- cmp %o2, 8
- blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left
- nop !
- ld [%o1], %o4 ! load 4 bytes
- subcc %o2, 8, %o2 ! decrease count by 8
- stw %o4, [%o0] ! and store 4 bytes
- add %o1, 8, %o1 ! increase src ptr by 8
- ld [%o1-4], %o3 ! load 4 bytes
- add %o0, 8, %o0 ! increase dst ptr by 8
- stw %o3, [%o0-4] ! and store 4 bytes
- bz %ncc, .medwexit ! exit if finished
- nop
-.medw7: ! count is ge 1, less than 8
- cmp %o2, 3 ! check for 4 bytes left
- ble,pt %ncc, .medw3 ! skip if 3 or fewer bytes left
- nop !
- ld [%o1], %o4 ! load 4 bytes
- sub %o2, 4, %o2 ! decrease count by 4
- add %o1, 4, %o1 ! increase src ptr by 4
- stw %o4, [%o0] ! and store 4 bytes
- add %o0, 4, %o0 ! increase dst ptr by 4
- tst %o2 ! check for zero bytes left
- bz %ncc, .medwexit ! exit if finished
- nop
-.medw3: ! count is known to be 1, 2, or 3
- deccc %o2 ! reduce count by one
- ldub [%o1], %o3 ! load one byte
- bz,pt %ncc, .medwexit ! exit if last byte
- stb %o3, [%o0] ! store one byte
- ldub [%o1+1], %o3 ! load second byte
- deccc %o2 ! reduce count by one
- bz,pt %ncc, .medwexit ! exit if last byte
- stb %o3, [%o0+1] ! store second byte
- ldub [%o1+2], %o3 ! load third byte
- stb %o3, [%o0+2] ! store third byte
-.medwexit:
- retl
- mov %g1, %o0 ! restore %o0
-
-/*
- * Special case for handling when src and dest are both long word aligned
- * and total data to move is between SMALL_MAX and MED_MAX bytes
- */
-
- .align 16
- nop
-src_dst_lword_aligned:
-.medlword: ! long word aligned
- cmp %o2, MED_MAX ! limit to store buffer size
- bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop
- nop
- subcc %o2, 31, %o2 ! adjust length to allow cc test
- ! for end of loop
- ble,pt %ncc, .medl31 ! skip big loop if less than 32
- prefetch [%o1 + (3 * BLOCK_SIZE)], #n_reads ! into the l2 cache
-/*
- * no need to put prefetch in loop as prefetches have
- * already been issued for maximum loop size
- */
-.medl32:
- ldx [%o1], %o4 ! load
- subcc %o2, 32, %o2 ! decrement length count
- stx %o4, [%o0] ! and store
- ldx [%o1+8], %o3 ! a block of 32 bytes
- add %o1, 32, %o1 ! increase src ptr by 32
- stx %o3, [%o0+8]
- ldx [%o1-16], %o4
- add %o0, 32, %o0 ! increase dst ptr by 32
- stx %o4, [%o0-16]
- ldx [%o1-8], %o3
- bgu,pt %ncc, .medl32 ! repeat if at least 32 bytes left
- stx %o3, [%o0-8]
-.medl31:
- addcc %o2, 16, %o2 ! adjust remaining count
- ble,pt %ncc, .medl15 ! skip if 15 or fewer bytes left
- nop !
- ldx [%o1], %o4 ! load and store 16 bytes
- add %o1, 16, %o1 ! increase src ptr by 16
- stx %o4, [%o0] !
- sub %o2, 16, %o2 ! decrease count by 16
- ldx [%o1-8], %o3 !
- add %o0, 16, %o0 ! increase dst ptr by 16
- stx %o3, [%o0-8]
-.medl15:
- addcc %o2, 15, %o2 ! restore count
- bz,pt %ncc, .medwexit ! exit if finished
- nop
- cmp %o2, 8
- blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left
- nop
- ldx [%o1], %o4 ! load 8 bytes
- add %o1, 8, %o1 ! increase src ptr by 8
- stx %o4, [%o0] ! and store 8 bytes
- subcc %o2, 8, %o2 ! decrease count by 8
- bz %ncc, .medwexit ! exit if finished
- add %o0, 8, %o0 ! increase dst ptr by 8
- ba .medw7
- nop
-
- .align 16
- nop
- nop
- nop
-unaligned_src_dst:
-
-.mediumsetup:
- prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
-.mediumrejoin:
- rd %fprs, %o4 ! check for unused fp
-
- add %o1, 8, %o1 ! prepare to round SRC upward
-
- sethi %hi(0x1234567f), %o5 ! For GSR.MASK
- or %o5, 0x67f, %o5
- andcc %o4, FPRS_FEF, %o4 ! test FEF, fprs.du = fprs.dl = 0
- bz,a %ncc, 3f
- wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
-3:
- cmp %o2, MEDIUM_MAX
- bmask %o5, %g0, %g0
-
- ! Compute o5 (number of bytes that need copying using the main loop).
- ! First, compute for the medium case.
- ! Then, if large case, o5 is replaced by count for block alignment.
- ! Be careful not to read past end of SRC
- ! Currently, o2 is the actual count remaining
- ! o3 is how much sooner we'll cross the alignment boundary
- ! in SRC compared to in DST
- !
- ! Examples: Let # denote bytes that should not be accessed
- ! Let x denote a byte already copied to align DST
- ! Let . and - denote bytes not yet copied
- ! Let | denote double alignment boundaries
- !
- ! DST: ######xx|........|--------|..###### o2 = 18
- ! o0
- !
- ! o3 = -3: SRC: ###xx...|.....---|-----..#|######## o5 = 8
- ! o1
- !
- ! o3 = 0: SRC: ######xx|........|--------|..###### o5 = 16-8 = 8
- ! o1
- !
- ! o3 = +1: SRC: #######x|x.......|.-------|-..##### o5 = 16-8 = 8
- ! o1
-
- mov %asi, %g5 ! save curr %asi
- wr %g0, ASI_CACHE_SPARING, %asi
-
- or %g0, -8, %o5
- alignaddr %o1, %g0, %o1 ! set GSR.ALIGN and align o1
-
- movrlz %o3, %g0, %o5 ! subtract 8 from o2+o3 only if o3>=0
- add %o5, %o2, %o5
- add %o5, %o3, %o5
-
- bleu %ncc, 4f
- andn %o5, 7, %o5 ! 8 byte aligned count
- neg %o0, %o5 ! 'large' case
- and %o5, BLOCK_SIZE-1, %o5 ! bytes till DST block aligned
-4:
- brgez,a %o3, .beginmedloop
- ldda [%o1-8]%asi, %d0
-
- add %o1, %o3, %o1 ! back up o1
-5:
- ldda [%o1]ASI_FL8_P, %d2
- inc %o1
- andcc %o1, 7, %g0
- bnz %ncc, 5b
- bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2
-
-.beginmedloop:
- tst %o5
- bz %ncc, .endmedloop
- sub %o2, %o5, %o2 ! update count for later
-
- ! Main loop to write out doubles. Note: o5 & 7 == 0
-
- ldd [%o1], %d2
- subcc %o5, 8, %o5 ! update local count
- bz,pn %ncc, 1f
- add %o1, 8, %o1 ! update SRC
-
-.medloop:
- faligndata %d0, %d2, %d4
- ldda [%o1]%asi, %d0
- subcc %o5, 8, %o5 ! update local count
- add %o1, 16, %o1 ! update SRC
- std %d4, [%o0]
- bz,pn %ncc, 2f
- faligndata %d2, %d0, %d6
- ldda [%o1 - 8]%asi, %d2
- subcc %o5, 8, %o5 ! update local count
- std %d6, [%o0 + 8]
- bnz,pt %ncc, .medloop
- add %o0, 16, %o0 ! update DST
-
-1:
- faligndata %d0, %d2, %d4
- fmovd %d2, %d0
- std %d4, [%o0]
- ba .endmedloop
- add %o0, 8, %o0
-
-2:
- std %d6, [%o0 + 8]
- sub %o1, 8, %o1
- add %o0, 16, %o0
-
-
-.endmedloop:
- ! Currently, o1 is pointing to the next double-aligned byte in SRC
- ! The 8 bytes starting at [o1-8] are available in d0
- ! At least one, and possibly all, of these need to be written.
-
- cmp %o2, BLOCK_SIZE
- bgu %ncc, .large ! otherwise, less than 16 bytes left
-
-#if 1
-
- /* This code will use partial stores. */
-
- mov %g0, %o5
- and %o3, 7, %o3 ! Number of bytes needed to completely
- ! fill %d0 with good (unwritten) data.
-
- subcc %o2, 8, %o2 ! update count (maybe too much)
- movl %ncc, %o2, %o5
- addcc %o3, %o5, %o5 ! extra bytes we can stuff into %d0
- sub %o3, %o5, %o3 ! update o3 (# bad bytes in %d0)
-
- bz %ncc, 2f
- alignaddr %o3, %g0, %g0 ! set GSR.ALIGN
-
-1:
- deccc %o5
- ldda [%o1]ASI_FL8_P, %d2
- inc %o1
- bgu %ncc, 1b
- bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2
-
-2:
- not %o3
- faligndata %d0, %d0, %d0 ! shift bytes to the left
- and %o3, 7, %o3 ! last byte to be stored in [%o0+%o3]
- edge8n %g0, %o3, %o5
- stda %d0, [%o0]%o5, ASI_PST8_P
- brlez %o2, .exit_memcpy
- add %o0, %o3, %o0 ! update DST to last stored byte
-3:
- inc %o0
- deccc %o2
- ldub [%o1], %o3
- stb %o3, [%o0]
- bgu %ncc, 3b
- inc %o1
-
-#else
-
- andcc %o3, 7, %o5 ! Number of bytes needed to completely
- ! fill %d0 with good (unwritten) data.
- bz %ncc, 2f
- sub %o5, 8, %o3 ! -(number of good bytes in %d0)
- cmp %o2, 8
- bl,a %ncc, 3f ! Not enough bytes to fill %d0
- add %o1, %o3, %o1 ! Back up %o1
-
-1:
- deccc %o5
- ldda [%o1]ASI_FL8_P, %d2
- inc %o1
- bgu %ncc, 1b
- bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2
-
-2:
- subcc %o2, 8, %o2
- std %d0, [%o0]
- bz %ncc, .exit_memcpy
- add %o0, 8, %o0
-3:
- ldub [%o1], %o3
- deccc %o2
- inc %o1
- stb %o3, [%o0]
- bgu %ncc, 3b
- inc %o0
-#endif
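
The enabled variant above finishes with a byte-masked 8-byte store: edge8n
generates a byte mask and stda ...ASI_PST8_P writes only the selected
bytes. A hedged C model of the masked store, assuming edge8n(%g0, last)
yields a mask covering bytes 0..last and that bit (7 - i) of the mask
guards byte i (big-endian bit order):

    #include <stdint.h>

    static void pst8_model(uint8_t *dst, const uint8_t src[8], uint8_t mask)
    {
        for (int i = 0; i < 8; i++)
            if (mask & (0x80u >> i))
                dst[i] = src[i];       /* unselected bytes are untouched */
    }
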
-
-.exit_memcpy:
- wr %o4, %g0, %fprs ! fprs = o4 restore fprs
- mov %g5, %asi ! restore %asi
- retl
- mov %g1, %o0
-
- .align ICACHE_LINE_SIZE
-.large:
- ! The following test for BSTORE_SIZE is used to decide whether
- ! to store data with a block store or with individual stores.
- ! The block store wins when the amount of data is so large
- ! that it causes other application data to be moved out
- ! of the L1 or L2 cache.
- ! On a Panther, block store can lose more often because block
- ! store forces the stored data to be removed from the L3 cache.
- !
- sethi %hi(BSTORE_SIZE),%o5
- or %o5,%lo(BSTORE_SIZE),%o5
- cmp %o2, %o5
- bgu %ncc, .xlarge
-
- ! %o0 I/O DST is 64-byte aligned
- ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
- ! %d0 I/O already loaded with SRC data from [%o1-8]
- ! %o2 I/O count (number of bytes that need to be written)
- ! %o3 I Not written. If zero, then SRC is double aligned.
- ! %o4 I Not written. Holds fprs.
- ! %o5 O The number of doubles that remain to be written.
-
- ! Load the rest of the current block
- ! Recall that %o1 is further into SRC than %o0 is into DST
-
- prefetch [%o0 + (0 * BLOCK_SIZE)], #n_writes
- prefetch [%o0 + (1 * BLOCK_SIZE)], #n_writes
- prefetch [%o0 + (2 * BLOCK_SIZE)], #n_writes
- ldda [%o1]%asi, %d2
- prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
- ldda [%o1 + 0x8]%asi, %d4
- faligndata %d0, %d2, %d16
- ldda [%o1 + 0x10]%asi, %d6
- faligndata %d2, %d4, %d18
- ldda [%o1 + 0x18]%asi, %d8
- faligndata %d4, %d6, %d20
- ldda [%o1 + 0x20]%asi, %d10
- or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8
- prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
- faligndata %d6, %d8, %d22
- ldda [%o1 + 0x28]%asi, %d12
- movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later)
- faligndata %d8, %d10, %d24
- ldda [%o1 + 0x30]%asi, %d14
- faligndata %d10, %d12, %d26
- ldda [%o1 + 0x38]%asi, %d0
- sub %o2, BLOCK_SIZE, %o2 ! update count
- prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
- add %o1, BLOCK_SIZE, %o1 ! update SRC
-
- ! Main loop. Write previous block. Load rest of current block.
- ! Some bytes will be loaded that won't yet be written.
-1:
- ldda [%o1]%asi, %d2
- faligndata %d12, %d14, %d28
- ldda [%o1 + 0x8]%asi, %d4
- faligndata %d14, %d0, %d30
- std %d16, [%o0]
- std %d18, [%o0+8]
- std %d20, [%o0+16]
- std %d22, [%o0+24]
- std %d24, [%o0+32]
- std %d26, [%o0+40]
- std %d28, [%o0+48]
- std %d30, [%o0+56]
- sub %o2, BLOCK_SIZE, %o2 ! update count
- prefetch [%o0 + (6 * BLOCK_SIZE)], #n_writes
- prefetch [%o0 + (3 * BLOCK_SIZE)], #n_writes
- add %o0, BLOCK_SIZE, %o0 ! update DST
- ldda [%o1 + 0x10]%asi, %d6
- faligndata %d0, %d2, %d16
- ldda [%o1 + 0x18]%asi, %d8
- faligndata %d2, %d4, %d18
- ldda [%o1 + 0x20]%asi, %d10
- faligndata %d4, %d6, %d20
- ldda [%o1 + 0x28]%asi, %d12
- faligndata %d6, %d8, %d22
- ldda [%o1 + 0x30]%asi, %d14
- faligndata %d8, %d10, %d24
- ldda [%o1 + 0x38]%asi, %d0
- faligndata %d10, %d12, %d26
- cmp %o2, BLOCK_SIZE + 8
- prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
- bgu,pt %ncc, 1b
- add %o1, BLOCK_SIZE, %o1 ! update SRC
- faligndata %d12, %d14, %d28
- faligndata %d14, %d0, %d30
- stda %d16, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache
- cmp %o2, BLOCK_SIZE
- bne %ncc, 2f ! exactly 1 block remaining?
- add %o0, BLOCK_SIZE, %o0 ! update DST
- brz,a %o3, 3f ! is SRC double aligned?
- ldd [%o1], %d2
-
-2:
- add %o5, %o2, %o5 ! %o5 was already set to 0 or -8
- add %o5, %o3, %o5
-
- ba .beginmedloop
- andn %o5, 7, %o5 ! 8 byte aligned count
-
- ! This is when there is exactly 1 block remaining and SRC is aligned
-3:
- ! %d0 was loaded in the last iteration of the loop above, and
- ! %d2 was loaded in the branch delay slot that got us here.
- ldd [%o1 + 0x08], %d4
- ldd [%o1 + 0x10], %d6
- ldd [%o1 + 0x18], %d8
- ldd [%o1 + 0x20], %d10
- ldd [%o1 + 0x28], %d12
- ldd [%o1 + 0x30], %d14
- stda %d0, [%o0]ASI_BLK_P
-
- ba .exit_memcpy
- nop
-
-
- .align 16
- ! two nops here cause the loop starting at 1f below to fall
- ! on a cache line boundary, improving performance
- nop
- nop
-xlarge:
-.xlarge:
- /*
- set 4096, %l2
- subcc %o2, %l2, %g0
- bge %ncc, size_ge_4k
- nop
- */
- ! %o0 I/O DST is 64-byte aligned
- ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
- ! %d0 I/O already loaded with SRC data from [%o1-8]
- ! %o2 I/O count (number of bytes that need to be written)
- ! %o3 I Not written. If zero, then SRC is double aligned.
- ! %o4 I Not written. Holds fprs.
- ! %o5 O The number of doubles that remain to be written.
-
- ! Load the rest of the current block
- ! Recall that %o1 is further into SRC than %o0 is into DST
-
- ! prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
- ! executed in delay slot for branch to .xlarge
- prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
- prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
- ldda [%o1]%asi, %d2
- prefetch [%o1 + (6 * BLOCK_SIZE)], #one_read
- ldda [%o1 + 0x8]%asi, %d4
- faligndata %d0, %d2, %d16
- ldda [%o1 + 0x10]%asi, %d6
- faligndata %d2, %d4, %d18
- ldda [%o1 + 0x18]%asi, %d8
- faligndata %d4, %d6, %d20
- ldda [%o1 + 0x20]%asi, %d10
- or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8
- faligndata %d6, %d8, %d22
- ldda [%o1 + 0x28]%asi, %d12
- movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later)
- faligndata %d8, %d10, %d24
- ldda [%o1 + 0x30]%asi, %d14
- faligndata %d10, %d12, %d26
- ldda [%o1 + 0x38]%asi, %d0
- sub %o2, BLOCK_SIZE, %o2 ! update count
- prefetch [%o1 + (7 * BLOCK_SIZE)], #one_read
- add %o1, BLOCK_SIZE, %o1 ! update SRC
-
- ! This point is 32-byte aligned, as 24 instructions appear after
- ! the previous alignment directive.
-
-
- ! Main loop. Write previous block. Load rest of current block.
- ! Some bytes will be loaded that won't yet be written.
-1:
- ldda [%o1]%asi, %d2
- faligndata %d12, %d14, %d28
- ldda [%o1 + 0x8]%asi, %d4
- faligndata %d14, %d0, %d30
- stda %d16, [%o0]ASI_BLK_P
- sub %o2, BLOCK_SIZE, %o2 ! update count
- ldda [%o1 + 0x10]%asi, %d6
- faligndata %d0, %d2, %d16
- ldda [%o1 + 0x18]%asi, %d8
- faligndata %d2, %d4, %d18
- ldda [%o1 + 0x20]%asi, %d10
- faligndata %d4, %d6, %d20
- ldda [%o1 + 0x28]%asi, %d12
- faligndata %d6, %d8, %d22
- ldda [%o1 + 0x30]%asi, %d14
- faligndata %d8, %d10, %d24
- ldda [%o1 + 0x38]%asi, %d0
- faligndata %d10, %d12, %d26
- ! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
- prefetch [%o1 + (8 * BLOCK_SIZE) + 8], #one_read
- add %o0, BLOCK_SIZE, %o0 ! update DST
- cmp %o2, BLOCK_SIZE + 8
- ! second prefetch important to correct for occasional dropped
- ! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K
- ! strong prefetch prevents drops on Panther, but Jaguar and earlier
- ! US-III models treat strong prefetches as weak prefetches;
- ! to avoid regressions on customer hardware, we retain the prefetch
- prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
- bgu,pt %ncc, 1b
- add %o1, BLOCK_SIZE, %o1 ! update SRC
-
- faligndata %d12, %d14, %d28
- faligndata %d14, %d0, %d30
- stda %d16, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache
- cmp %o2, BLOCK_SIZE
- bne %ncc, 2f ! exactly 1 block remaining?
- add %o0, BLOCK_SIZE, %o0 ! update DST
- brz,a %o3, 3f ! is SRC double aligned?
- ldd [%o1], %d2
-
-2:
- add %o5, %o2, %o5 ! %o5 was already set to 0 or -8
- add %o5, %o3, %o5
-
-
- ba .beginmedloop
- andn %o5, 7, %o5 ! 8 byte aligned count
-
-
- ! This is when there is exactly 1 block remaining and SRC is aligned
-3:
- ! %d0 was loaded in the last iteration of the loop above, and
- ! %d2 was loaded in the branch delay slot that got us here.
- ldd [%o1 + 0x08], %d4
- ldd [%o1 + 0x10], %d6
- ldd [%o1 + 0x18], %d8
- ldd [%o1 + 0x20], %d10
- ldd [%o1 + 0x28], %d12
- ldd [%o1 + 0x30], %d14
- stda %d0, [%o0]ASI_BLK_P
-
- ba .exit_memcpy
- nop
-
-copying_ge_512:
- mov %o0, %o5 ! save dst address for return value.
- ! both src and dst are aligned to 8 byte boundary.
- save %sp, -SA(STACK_OFFSET + SAVESIZE), %sp
- mov %i0, %o0
- mov %i1, %o1
- mov %i2, %o2
- mov %i3, %o3
- mov %i5, %o5
-#ifndef __sparcv9
- stx %g4, [%sp + STACK_OFFSET + g4_offset]
- stx %g5, [%sp + STACK_OFFSET + g5_offset]
-#endif
- rd %fprs, %g5 ! check for unused fp
- andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0
- bz,a %ncc, 1f
- wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
-1:
- !prefetch src buf
- sub %o1,1,%o3
- andn %o3,0x7f,%l1
- add %l1,128,%l1
- prefetch [%l1],2 !prefetch next 128b
- prefetch [%l1+64],2
- prefetch [%l1+(2*64)],2 !cont from above
- prefetch [%l1+(3*64)],2
- !prefetch dst buf
- sub %o5,1,%o3
- andn %o3,0x7f,%l1
- add %l1,128,%l1
- prefetch [%l1],2 !prefetch next 128b
- prefetch [%l1+64],2
- prefetch [%l1+(2*64)],2 !cont from above
- prefetch [%l1+(3*64)],2
-
- andcc %o5,0x7f,%o3 !o3=0 means it is already 128-byte aligned
- brz,pn %o3,aligned_on_128
- sub %o3,128,%o3
-
- add %o2,%o3,%o2
-align_to_128:
- ldxa [%o1]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o1,8,%o1 ! increment src pointer
- stxa %o4,[%o5]ASI_CACHE_SPARING_PRIMARY
- addcc %o3,8,%o3
- bl,pt %ncc,align_to_128
- add %o5,8,%o5 ! increment dst pointer
-
-aligned_on_128:
- andcc %o5,0x1ff,%o3 !%o3=0 when it is 512-byte aligned.
- brnz,pn %o3, 4f
- mov %o2,%l4 !l4=count from 512 align
- set 4096, %l2
- subcc %o2, %l2, %g0
- bge,pn %ncc, stingray_optimized_copy
- nop
-4:
-
- sub %o5,8,%l6 !should be in the current 512-byte chunk
- andn %l6,0x1ff,%o3 !%o3=aligned 512-byte addr
- add %o3,0x200,%o3 !%o3=next aligned 512-byte addr at which to start
- ! stingray_optimized_copy
- sub %o3,%o5,%o3 !o3=how many bytes in the current remaining chunk
- sub %o2,%o3,%l4 !l4=count from the 512-byte alignment point
- /*
- * if l4 is < 4096 do interleave_128_copy only.
- */
- set 4096, %l2
- subcc %l4, %l2, %g0
- bge,pn %ncc,6f
- nop
- mov %g0, %l4
- add %o5, %o2, %l1
- ba interleave_128_copy
- nop
-6:
- mov %o3, %o2
- subcc %o3,256,%g0 ! if it is > 256 bytes, we can use
- ! interleave_128_copy
- bl,pn %ncc,copy_word ! otherwise use copy_word to finish the 512 byte
- ! alignment.
- !%o1=64 bytes data
- !%o5=next 8 byte addr to write
- !%o2=new count i.e how many bytes to write
- add %o5,%o2,%l1 !calculate the last byte to write, %l1
- ba interleave_128_copy
- nop
-
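
The arithmetic at 4: above measures the distance to the next 512-byte
destination boundary; a C sketch of it, assuming dst and n mirror %o5
and %o2:

    uintptr_t next512 = (((uintptr_t)dst - 8) & ~(uintptr_t)0x1ff) + 0x200;
    size_t to_boundary = next512 - (uintptr_t)dst;  /* this chunk    (%o3) */
    size_t after = n - to_boundary;                 /* 512-aligned   (%l4) */
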
- .align 64
-interleave_128_copy:
- ! %l1 has the addr of the dest. buffer at or beyond which no write
- ! is to be done.
- ! %l4 has the number of bytes to zero using stingray_optimized_bzero
- !prefetch src
- !prefetch src
-
- add %o1, 256, %o3
- prefetch [%o3], 2 !1st 64 byte line of next 256 byte block
- add %o1, 384, %o3
- prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block
- add %o1, 320, %o3
- prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block
- add %o1, 448, %o3
- prefetch [%o3], 2 !4th 64 byte line of next 256 byte block
-
- !prefetch dst
-
- add %o5, 256, %o3
- prefetch [%o3], 2 !1st 64 byte line of next 256 byte block
- add %o5, 384, %o3
- prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block
- add %o5, 320, %o3
- prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block
- add %o5, 448, %o3
- prefetch [%o3], 2 !4th 64 byte line of next 256 byte block
-
- ldxa [%o1]ASI_CACHE_SPARING_PRIMARY, %o4
- stxa %o4,[%o5]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line
- add %o1, 128, %o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, 128, %o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line
- add %o1, (1 * 8), %o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (1 * 8), %o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (1 * 8 + 128), %o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (1 * 8 + 128), %o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (2 * 8),%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (2 * 8),%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (2 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (2 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (3 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (3 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (3 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (3 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (4 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (4 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (4 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (4 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (5 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (5 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (5 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (5 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (6 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (6 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (6 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (6 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (7 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (7 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (7 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (7 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (8 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (8 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (8 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (8 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (9 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (9 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (9 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (9 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (10 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (10 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (10 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (10 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (11 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (11 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (11 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (11 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (12 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (12 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (12 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (12 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (13 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (13 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (13 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (13 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (14 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (14 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (14 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (14 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (15 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (15 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (15 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (15 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, 256, %o1
- ! check if the next 256 byte copy will not exceed the number of
- ! bytes remaining to be copied.
- ! %l2 points to the dest buffer after copying 256 bytes more.
- ! %l1 points to dest. buffer at or beyond which no writes should be done.
- add %o5,512,%l2
-
- subcc %l1,%l2,%g0
- bge,pt %ncc,interleave_128_copy
- add %o5,256,%o5
-
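
Each pass of interleave_128_copy above moves 256 bytes as two pairs of
cache lines 128 bytes apart, 8 bytes at a time, alternating between the
pairs so the stores spread across banks. Stripped of the cache-sparing
ASI, one pass is roughly:

    /* C sketch of one 256-byte pass; s and d mirror %o1 and %o5 */
    for (int i = 0; i < 16; i++) {
        ((uint64_t *)d)[i]      = ((const uint64_t *)s)[i];      /* lines 0-1 */
        ((uint64_t *)d)[i + 16] = ((const uint64_t *)s)[i + 16]; /* lines 2-3 */
    }
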
-copy_word:
- and %o2,255,%o3
- and %o3,7,%o2
-
- ! Set the remaining doubles
- subcc %o3, 8, %o3 ! Can we store any doubles?
- bl,pn %ncc, 6f
- and %o2, 7, %o2 ! calc bytes left after doubles
-
- !prefetch src
-
- mov %o1, %o4
- prefetch [%o4], 2 !1st 64 byte line of next 256 byte block
- add %o1, 128, %o4
- prefetch [%o4], 2 !3rd 64 byte line of next 256 byte block
- add %o1, 64, %o4
- prefetch [%o4], 2 !2nd 64 byte line of next 256 byte block
- add %o1, 192, %o4
- prefetch [%o4], 2 !4th 64 byte line of next 256 byte block
-
- !prefetch dst
-
- mov %o5, %o4
- prefetch [%o4], 2 !1st 64 byte line of next 256 byte block
- add %o5, 128, %o4
- prefetch [%o4], 2 !3rd 64 byte line of next 256 byte block
- add %o5, 64, %o4
- prefetch [%o4], 2 !2nd 64 byte line of next 256 byte block
- add %o5, 192, %o4
- prefetch [%o4], 2 !4th 64 byte line of next 256 byte block
-
-5:
- ldxa [%o1]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o1, 8, %o1
- stxa %o4, [%o5]ASI_CACHE_SPARING_PRIMARY
- subcc %o3, 8, %o3
- bge,pt %ncc, 5b
- add %o5, 8, %o5
-6:
- ! Set the remaining bytes
- brz %o2, can_we_do_stingray_optimized_copy
- nop
-
- ! Terminate the copy with a partial store.
- ! The data should be at d0
- ldxa [%o1]ASI_CACHE_SPARING_PRIMARY, %o4
- stx %o4, [%sp + STACK_OFFSET + scratch_offset]
- ldd [%sp + STACK_OFFSET + scratch_offset], %d0
-
- dec %o2 ! needed to get the mask right
- edge8n %g0, %o2, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
-can_we_do_stingray_optimized_copy:
- mov %l4, %o2
- brnz,pn %o2, stingray_optimized_copy
- nop
-
-exit:
- brnz %g5, 1f
- nop
- wr %g5, %g0, %fprs
-1:
-#ifndef __sparcv9
- ldx [%sp + STACK_OFFSET + g4_offset], %g4
- ldx [%sp + STACK_OFFSET + g5_offset], %g5
-#endif
- ret ! %o0 was preserved
- restore
-
-
-stingray_optimized_copy:
-!%o5 = next memory addr which is 512-byte aligned
-!%l4 = remaining bytes from the 512-byte alignment point.
-
- add %o5, %l4, %o2
-
- prefetch [%o1+0],2
- prefetch [%o1+(64*1)],2
- prefetch [%o1+(64*2)],2
- prefetch [%o1+(64*3)],2
- prefetch [%o1+(64*4)],2
- prefetch [%o1+(64*5)],2
- prefetch [%o1+(64*6)],2
- prefetch [%o1+(64*7)],2
- prefetch [%o1+(64*8)],2
- prefetch [%o1+(64*9)],2
- prefetch [%o1+(64*10)],2
- prefetch [%o1+(64*11)],2
- prefetch [%o1+(64*12)],2
- prefetch [%o1+(64*13)],2
- prefetch [%o1+(64*14)],2
- prefetch [%o1+(64*15)],2
-
- prefetch [%o5+0],2
- prefetch [%o5+(64*1)],2
- prefetch [%o5+(64*2)],2
- prefetch [%o5+(64*3)],2
- prefetch [%o5+(64*4)],2
- prefetch [%o5+(64*5)],2
- prefetch [%o5+(64*6)],2
- prefetch [%o5+(64*7)],2
- prefetch [%o5+(64*8)],2
- prefetch [%o5+(64*9)],2
- prefetch [%o5+(64*10)],2
- prefetch [%o5+(64*11)],2
- prefetch [%o5+(64*12)],2
- prefetch [%o5+(64*13)],2
- prefetch [%o5+(64*14)],2
- prefetch [%o5+(64*15)],2
-
- ba myloop2
- srl %l4, 12, %l4
-
- ! Local register usage:
- !
- ! %l1 address a short distance ahead of current %o1 for prefetching
- ! into L1 cache.
- ! %l2 address far ahead of current %o1 for prefetching into L2 cache.
- ! %l3 save %o5 at start of inner loop.
- ! %l4 Number of 4k blocks to copy
- ! %g1 save %o1 at start of inner loop.
- ! %l5 iteration counter to make buddy loop execute 2 times.
- ! %l6 iteration counter to make inner loop execute 4 times
- ! (8 lines per iteration, 32 lines total).
- ! %l7 address far ahead of current %o5 for prefetching destination
- ! into L2 cache.
-
-.align 64
-myloop2:
- set 2,%l5 ! %l5 is the loop count for the buddy loop, for 2 buddy lines.
- add %o5, 0, %l3
- add %o1, 0, %g1
-buddyloop:
- set PF_FAR, %g4 ! Prefetch far ahead. CHANGE FAR PREFETCH HERE.
- add %o1, %g4, %l2 ! For prefetching far ahead, set %l2 far ahead
- ! of %o1
- add %o1, PF_NEAR, %l1 ! For prefetching into L1 D$, set %l1 a
- ! little ahead of %o1
- add %o5, %g4, %l7 ! For prefetching far ahead, set %l7 far ahead
- ! of %o5
-
- add %l2, %g4, %g4 ! %g4 is now double far ahead of the source
- ! address in %o1.
- prefetch [%g4+%g0],2 ! Prefetch ahead by several pages to get TLB
- ! entry in advance.
- set 2*PF_FAR, %g4 ! Prefetch double far ahead. SET DOUBLE FAR
- ! PREFETCH HERE.
- add %o5, %g4, %g4 ! %g4 is now double far ahead of the dest
- ! address in %o5.
- prefetch [%g4+%g0],2 ! Prefetch ahead by 2 pages to get TLB entry
- ! in advance.
-
- set 4,%l6 ! %l6 = loop count for the inner loop,
- ! for 4 x 8 = 32 lines.
- set 0, %g4
-
- ! Each iteration of the inner loop below copies 8 sequential lines.
- ! This loop is iterated 4 times, to move a total of 32 lines,
- ! all of which have the same value of PA[9], so we increment the base
- ! address by 1024 bytes in each iteration, which varies PA[10]. */
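
In C terms, the three nested loops visit every 64-byte line of a 4 KB
block exactly once, in an order that keeps PA[9] fixed within each inner
pass (copy_line64 is a hypothetical stand-in for the eight-ldd/stda
group below):

    for (int buddy = 0; buddy < 2; buddy++)        /* even, then odd 512B chunks */
        for (int i = 0; i < 4; i++)                /* 4 chunks spaced 1 KB apart */
            for (int line = 0; line < 8; line++) { /* 8 x 64B lines per chunk    */
                size_t off = buddy * 512 + i * 1024 + line * 64;
                copy_line64(d + off, s + off);
            }
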
-innerloop:
- /* ---- copy line 1 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 2 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 3 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 4 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 5 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 6 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 7 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 8 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
-
- subcc %l6,1,%l6 /* Decrement the inner loop counter. */
-
- ! Now increment by 64 + 512 so we don't toggle PA[9]
- add %g4, 576, %g4
- add %o5, 576, %o5
-
- bg,pt %icc,innerloop
- add %o1, 576, %o1 ! increment %o1 for the next source line.
- ! END OF INNER LOOP
-
-
- subcc %l5,1,%l5
- add %l3, 512, %o5 ! increment %o5 to first buddy line of dest.
- bg,pt %icc,buddyloop
- add %g1, 512 ,%o1 ! Set %o1 to the first of the odd buddy lines.
-
- subcc %l4, 1, %l4
- add %o5, 3584, %o5 ! Advance both base addresses to 4k above where
- ! they started.
- add %o1, 3584, %o1 ! They were already incremented by 512,
- ! so just add 3584.
-
- bg,pt %icc,myloop2
- nop
-
- /****larryalg_end_here*************/
-
- sub %o2,%o5,%o2 !how many bytes left
- brz,pn %o2,complete_write
- mov %g0,%l4
- add %o5,%o2,%l1 !calculate the last byte to write, %l1
- subcc %o2,256,%g0
- bge,pt %ncc,interleave_128_copy
- mov %g0,%l4
-
- ba copy_word
- nop
-
-
-complete_write:
- ba exit
- nop
-
-
-
- SET_SIZE(memcpy)
- SET_SIZE(__align_cpy_1)
diff --git a/usr/src/lib/libc/sparc_hwcap1/common/gen/memset.s b/usr/src/lib/libc/sparc_hwcap1/common/gen/memset.s
deleted file mode 100644
index f9e0f62ac9..0000000000
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/memset.s
+++ /dev/null
@@ -1,767 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-
- .file "memset.s"
-/*
- * char *memset(sp, c, n)
- *
- * Set an array of n chars starting at sp to the character c.
- * Return sp.
- *
- * Fast assembler language version of the following C-program for memset
- * which represents the `standard' for the C-library.
- *
- * void *
- * memset(void *sp1, int c, size_t n)
- * {
- * if (n != 0) {
- * char *sp = sp1;
- * do {
- * *sp++ = (char)c;
- * } while (--n != 0);
- * }
- * return (sp1);
- * }
- */
-
-#include <sys/asm_linkage.h>
-#include <sys/sun4asi.h>
-
- ANSI_PRAGMA_WEAK(memset,function)
-
-#define SAVESIZE (8 * 1)
-#ifdef __sparcv9
-#define STACK_OFFSET (STACK_BIAS + 0)
-#else
-#define STACK_OFFSET (STACK_BIAS + 0 + 0)
-#endif
-#define scratch_offset 0
-
-#define ASI_CACHE_SPARING_PRIMARY 0xf4
-#define ALIGN8(X) (((X) + 7) & ~7)
-#define ICACHE_LINE_SIZE 64
-#define FPRS_FEF 0x4
-#define PF_FAR 2048
-
- .section ".text"
- .align ICACHE_LINE_SIZE
-
- /*
- * Optimizations done:
- *
- * No stores in delay slot of branch instructions.
- * conditional stores where possible
- * prefetch before doing stxa
- * Bank interleaved writing.
- */
-
- ENTRY(memset)
- add %sp, -SA(STACK_OFFSET + SAVESIZE), %sp
- mov %o0, %o5 ! copy sp1 before using it
- /*
- * If there are 0 bytes to transfer, return
- */
- brnz %o2, continue
- nop
- retl
- add %sp, SA(STACK_OFFSET + SAVESIZE), %sp
-continue:
- /*
- * If the count is a multiple of 8 and the buffer is 8-byte aligned,
- * we don't have to look at %fprs
- */
- or %o5, %o2, %o3
- and %o3, 7, %o3
- brnz %o3, check_fprs
- mov 4, %g1
- prefetch [%o5],2
- ba skip_rd_fprs
- nop
-
-check_fprs:
- rd %fprs, %g1 ! g1 = fprs
-skip_rd_fprs:
- prefetch [%o5],2
- andcc %g1, 0x4, %g1 ! fprs.du = fprs.dl = 0
- bnz %ncc, 1f ! Is fprs.fef == 1
- nop
- wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
-1:
- and %o1, 0xff, %o1 ! o1 is (char)c
- sll %o1, 8, %o3
- or %o1, %o3, %o1 ! now o1 has 2 bytes of c
- sll %o1, 16, %o3
- or %o1, %o3, %o1 ! now o1 has 4 bytes of c
- sllx %o1, 32, %o3
- or %o1, %o3, %o1 ! now o1 has 8 bytes of c
- stx %o1, [%sp + STACK_OFFSET + scratch_offset]
- ldd [%sp + STACK_OFFSET + scratch_offset], %d0
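
The shift/or ladder above replicates the fill byte across a 64-bit
pattern; in C:

    uint64_t pat = (uint8_t)c;    /* 1 byte of c  */
    pat |= pat << 8;              /* 2 bytes of c */
    pat |= pat << 16;             /* 4 bytes of c */
    pat |= pat << 32;             /* 8 bytes of c */
    /* the assembly then bounces pat through the stack into %d0
       so the FP partial-store path can use it */
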
- cmp %o2, 8
- bge,pt %ncc, xfer_8_or_more
- mov %o0, %o5
- /*
- * Do a partial store of %o2 bytes
- */
- andcc %o5, 7, %o3 ! is sp1 aligned on an 8 byte boundary
- brz,pt %o3, aligned_on_8
- sub %o5, %o3, %o5 ! align the destination buffer.
- mov %o3, %o1
- mov 8, %o4
- sub %o4, %o3, %o3
- cmp %o3, %o2
- bg,a,pn %ncc, 1f
- mov %o2, %o3
-1:
- ! %o3 has the bytes to be written in partial store.
- sub %o2, %o3, %o2
- dec %o3
- prefetch [%o5],2
- edge8n %g0, %o3, %o4
- srl %o4, %o1, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
- brz %o2, simple_ret
- add %o5, 8, %o5
-aligned_on_8:
- prefetch [%o5],2
- dec %o2 ! needed to get the mask right
- edge8n %g0, %o2, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
- brnz %g1, 1f ! was fprs.fef == 1
- nop
- wr %g1, %g0, %fprs ! fprs = g1 restore fprs
-1:
- retl
- add %sp, SA(STACK_OFFSET + SAVESIZE), %sp
-
-xfer_8_or_more:
- andcc %o5, 7, %o3 ! is sp1 aligned on an 8 byte boundary
- brz,pt %o3, blkchk
- sub %o5, %o3, %o5 ! align the destination buffer.
- sub %o3, 8, %o3 ! -(bytes till double aligned)
- add %o2, %o3, %o2 ! update o2 with new count
- xor %o3, 0xff, %o3
- and %o3, 7, %o3
- prefetch [%o5],2
- edge8ln %g0, %o3, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
- add %o5, 8, %o5
-
-
- ! Now sp1 is double aligned (sp1 is found in %o5)
-blkchk:
- cmp %o2, 767 ! if large count use Block ld/st
- bg,pt %ncc,blkwr
- nop
-
-
- and %o2, 24, %o3 ! o3 is {0, 8, 16, 24}
-
- brz %o3, skip_dw_loop
- nop
-
-1: subcc %o3, 8, %o3 ! double-word loop
- stx %o1, [%o5]
- bgu,pt %ncc, 1b
- add %o5, 8, %o5
-skip_dw_loop:
- andncc %o2, 31, %o4 ! o4 has 32 byte aligned count
- brz,pn %o4, 3f
- nop
- ba loop_32byte
- nop
-
- .align ICACHE_LINE_SIZE
-
-loop_32byte:
- subcc %o4, 32, %o4 ! main loop, 32 bytes per iteration
- stx %o1, [%o5]
- stx %o1, [%o5 + 8]
- stx %o1, [%o5 + 16]
- stx %o1, [%o5 + 24]
- bne,pt %ncc, loop_32byte
- add %o5, 32, %o5
-3:
- and %o2, 7, %o2 ! o2 has the remaining bytes (<8)
- brz %o2, skip_partial_copy
- nop
-
- ! Terminate the copy with a partial store.
- ! The data should be at d0
- prefetch [%o5],2
- dec %o2 ! needed to get the mask right
- edge8n %g0, %o2, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
-
-skip_partial_copy:
-simple_ret:
- brz,a %g1, 1f ! was fprs.fef == 0
- wr %g1, %g0, %fprs ! fprs = g1 restore fprs
-1:
- retl
- add %sp, SA(STACK_OFFSET + SAVESIZE), %sp
-
-blkwr:
- sub %o5,1,%o3
- andn %o3,0x7f,%o4
- add %o4,128,%o4
- prefetch [%o4],2 !prefetch next 128b
- prefetch [%o4+64],2
- prefetch [%o4+(2*64)],2 !cont from above
- prefetch [%o4+(3*64)],2
-
- andcc %o5,0x7f,%o3 !o3=0 means it is already 128-byte aligned
- brz,pn %o3,alreadyalign128
- sub %o3,128,%o3
-
- add %o2,%o3,%o2
-align128:
- stxa %o1,[%o5]ASI_CACHE_SPARING_PRIMARY
- addcc %o3,8,%o3
- bl,pt %ncc,align128
- add %o5,8,%o5
-
-
-
-alreadyalign128:
- andcc %o5,0x1ff,%o3 !%o3=0 when it is 512-byte aligned.
- brnz,pn %o3, 4f
- mov %o2,%g5 !g5=count from 512 align
- set 4096, %o4
- subcc %o2, %o4, %g0
- bge,pn %ncc, larry_alg
- nop
-4:
-
- sub %o5,8,%o4 !should be in the current 512-byte chunk
- andn %o4,0x1ff,%o3 !%o3=aligned 512-byte addr
- add %o3,0x200,%o3 !%o3=next aligned 512-byte addr that starts the larry process
- sub %o3,%o5,%o3 !o3=how many bytes in the current remaining chunk
- sub %o2,%o3,%g5 !g5=count from the 512-byte alignment point
- /*
- * if g5 is < 4096 do start_128 only.
- */
- set 4096, %o4
- subcc %g5, %o4, %g0
- bge,pn %ncc,6f
- nop
- mov %g0, %g5
- add %o5, %o2, %o4
- ba start_128
- nop
-6:
- mov %o3, %o2
- subcc %o3,256,%g0 !if it is > 256 bytes, we can use the st-interleave alg to write
- bl,pn %ncc,storeword !otherwise use storeword to finish the 512-byte alignment.
- !%o1=64 bytes data
- !%o5=next 8 byte addr to write
- !%o2=new count i.e how many bytes to write
- add %o5,%o2,%o4 !calculate the last byte to write, %o4
- ba start_128
- nop
-
- .align 64
-start_128:
- add %o5, 256, %o3
- prefetch [%o3], 2 !1st 64 byte line of next 256 byte block
- add %o5, 384, %o3
- prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block
- add %o5, 320, %o3
- prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block
- add %o5, 448, %o3
- prefetch [%o3], 2 !4th 64 byte line of next 256 byte block
- mov %o5, %o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line
- add %o5,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line
- add %o5,8,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(2 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128 ,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(3 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(4 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(5 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(6 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(7 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(8 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(9 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(10 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(11 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(12 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(13 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(14 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(15 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add	%o5,512,%o3	!%o3 = end of the next 256-byte block, to check if another 256-byte block lies ahead
-	subcc	%o4,%o3,%g0	!%o4 = final byte location; %o3 = final byte of the next 256-byte block
-	bge,pt	%ncc,start_128	!branch taken means the next 256-byte block is still within the limit
- add %o5,256,%o5
-
-! finish the remaining sub-256-byte tail
-storeword:
- and %o2,255,%o3
- and %o3,7,%o2
-
- ! Set the remaining doubles
- subcc %o3, 8, %o3 ! Can we store any doubles?
- bl,pn %ncc, 6f
- and %o2, 7, %o2 ! calc bytes left after doubles
-
-5:
- stxa %o1, [%o5]ASI_CACHE_SPARING_PRIMARY
- subcc %o3, 8, %o3
- bge,pt %ncc, 5b
- add %o5, 8, %o5
-6:
- ! Set the remaining bytes
- brz %o2, check_larry_alg ! safe to check all 64-bits
-
- ! Terminate the copy with a partial store.
- ! The data should be at d0
- dec %o2 ! needed to get the mask right
- edge8n %g0, %o2, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
-check_larry_alg:
- mov %g5, %o2
- brnz,pn %o2, larry_alg
- nop
-
-.exit:
- brz,a %g1, 1f ! was fprs.fef == 0
- wr %g1, %g0, %fprs ! fprs = g1 restore fprs
-1:
- retl ! %o0 was preserved
- add %sp, SA(STACK_OFFSET + SAVESIZE), %sp
-
-larry_alg:
- add %sp, SA(STACK_OFFSET + SAVESIZE), %sp
- save %sp, -SA(MINFRAME), %sp
- mov %i0, %o0
- mov %i1, %o1
- mov %i2, %o2
- mov %i3, %o3
- mov %i5, %o5
-!%o5 = next memory addr, which is 512-byte aligned
-!%g5 = remaining bytes from the 512-byte aligned point
-init:
- set 4096,%g6
-
- prefetch [%o5+0],2
- prefetch [%o5+(64*1)],2
- prefetch [%o5+(64*2)],2
- prefetch [%o5+(64*3)],2
- prefetch [%o5+(64*4)],2
- prefetch [%o5+(64*5)],2
- prefetch [%o5+(64*6)],2
- prefetch [%o5+(64*7)],2
- prefetch [%o5+(64*8)],2
- prefetch [%o5+(64*9)],2
- prefetch [%o5+(64*10)],2
- prefetch [%o5+(64*11)],2
- prefetch [%o5+(64*12)],2
- prefetch [%o5+(64*13)],2
- prefetch [%o5+(64*14)],2
- prefetch [%o5+(64*15)],2
- ba myloop2
- add %o5,%g5,%g5
-	/* Local register usage:
-	   %l3  saves %o5 at the start of each myloop2 iteration.
-	   %l5  iteration counter to make the buddy loop execute 2 times.
-	   %l6  iteration counter to make the inner loop execute 4 times (32 lines).
-	   %l7  address far ahead of the current %o5, for prefetching the destination into the L2 cache.
-	 */
-
- .align 64
-myloop2:
- /* Section 1 */
- set 2,%l5 /* %l5 is the loop count for the buddy loop, for 2 buddy lines. */
- add %o5, 0, %l3
-buddyloop:
- set PF_FAR, %l4 /* Prefetch far ahead. CHANGE FAR PREFETCH HERE. <<==== */
- add %o5, %l4, %l7 /* For prefetching far ahead, set %l7 far ahead of %o5 */
-
- set 2*PF_FAR, %l4 /* Prefetch double far ahead. SET DOUBLE FAR PREFETCH HERE. <<==== */
- add %o5, %l4, %l4 /* %l4 is now double far ahead of the dest address in %o5. */
- prefetch [%l4+%g0],2 /* Prefetch ahead by 2 pages to get TLB entry in advance. */
-
- set 4,%l6 /* %l6 = loop count for the inner loop, for 4 x 8 = 32 lines. */
- set 0, %l4
-
-
-/* Each iteration of the inner loop below writes 8 sequential lines. This loop is iterated 4 times,
- to move a total of 32 lines, all of which have the same value of PA[9], so we increment the base
- address by 1024 bytes in each iteration, which varies PA[10]. */
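In C, the traversal order these loops implement can be sketched as follows — a minimal illustration assuming a 512-byte-aligned base and a count handled 4 KB at a time (the tail is finished elsewhere); fill_512 and larry_order are hypothetical names, and fill_512 stands in for the interleaved stxa sequences rather than reproducing them:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical stand-in for one 256+256 byte group of stxa stores:
     * fills a 512-byte chunk with the 8-byte pattern. */
    static void fill_512(uint8_t *p, uint64_t pattern)
    {
        for (int i = 0; i < 512; i += 8)
            memcpy(p + i, &pattern, 8);
    }

    /* Within each 4 KB region, set every 512-byte chunk with one value
     * of PA[9] first (innerloop), then the interleaved buddy chunks
     * with the opposite PA[9] (buddyloop). */
    static void larry_order(uint8_t *base, size_t total, uint64_t pattern)
    {
        for (size_t region = 0; region + 4096 <= total; region += 4096) /* myloop2 */
            for (int half = 0; half < 2; half++)                        /* buddyloop */
                for (size_t chunk = 0; chunk < 4096; chunk += 1024)     /* innerloop */
                    fill_512(base + region + half * 512 + chunk, pattern);
    }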
-innerloop:
- add %o5, PF_FAR, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
-
- mov %o5, %o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line
- add %o5,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line
- add %o5,8,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(2 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128 ,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(3 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(4 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(5 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(6 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(7 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(8 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(9 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(10 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(11 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(12 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(13 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(14 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(15 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-
- add %o5,256,%o5
-
- mov %o5, %o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line
- add %o5,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line
- add %o5,8,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(2 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128 ,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(3 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(4 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(5 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(6 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(7 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(8 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(9 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(10 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(11 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(12 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(13 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(14 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(15 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-
- subcc %l6,1,%l6 /* Decrement the inner loop counter. */
-
- /* -------- Now increment by 256 + 512 so we don't toggle PA[9] -------- */
- add %o5, 768, %o5
-
- bg,pt %ncc,innerloop
- nop
-/* ------------------------ END OF INNER LOOP -------------------------- */
-
- subcc %l5,1,%l5
- add %l3, 512, %o5 /* increment %o5 to first buddy line of dest. */
- bg,pt %ncc,buddyloop
- nop
-	add	%o5, 3584, %o5	/* Advance the base address to 4k above where it started. */
-				!%o5 = next 4096-byte block.
- add %o5,%g6,%i5
- subcc %g5,%i5,%g0
- bge,pt %ncc,myloop2
- nop
-
-
- /****larryalg_end_here*************/
-
-	sub	%g5,%o5,%o2		!how many bytes are left
- brz,pn %o2,complete_write
- mov %g0,%g5
-	add	%o5,%o2,%o4		!calculate the last byte to write, into %o4
- subcc %o2,256,%g0
- bge,pt %ncc,memset_128
- mov %g0,%g5
-
- ba memset_storeword
- nop
-
-
-complete_write:
- brz,a %g1, 1f ! was fprs.fef == 0
- wr %g1, %g0, %fprs ! fprs = g1 restore fprs
-1:
- ret ! %o0 was preserved
- restore
-
- .align 64
-memset_128:
- add %o5, 256, %o3
- prefetch [%o3], 2 !1st 64 byte line of next 256 byte block
- add %o5, 384, %o3
- prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block
- add %o5, 320, %o3
- prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block
- add %o5, 448, %o3
- prefetch [%o3], 2 !4th 64 byte line of next 256 byte block
- mov %o5, %o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line
- add %o5,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line
- add %o5,8,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(2 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128 ,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(3 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(4 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(5 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(6 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(7 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(8 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(9 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(10 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(11 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(12 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(13 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(14 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(15 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add	%o5,512,%l4	!%l4 = end of the next 256-byte block, to check if another 256-byte block lies ahead
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-!This branch condition is not needed when handling the bytes before the first
-!4096-byte region, because this block is issued only once there, so %l6 holds
-!stale data. The branch really serves the bytes after the 4096-byte region,
-!where there may be multiple 256-byte blocks to work on.
-
-	subcc	%o4,%l4,%g0	!%o4 = final byte location; %l4 = final byte of the next 256-byte block
-	bge,pt	%ncc,memset_128	!branch taken means the next 256-byte block is still within the limit
- add %o5,256,%o5
-
-! finish the remaining sub-256-byte tail
-memset_storeword:
- and %o2,255,%o3
- and %o3,7,%o2
-
- ! Set the remaining doubles
- subcc %o3, 8, %o3 ! Can we store any doubles?
- bl,pn %ncc, 6f
- and %o2, 7, %o2 ! calc bytes left after doubles
-
-5:
- stxa %o1, [%o5]ASI_CACHE_SPARING_PRIMARY
- subcc %o3, 8, %o3
- bge,pt %ncc, 5b
- add %o5, 8, %o5
-6:
- ! Set the remaining bytes
- brz %o2, complete_write ! safe to check all 64-bits
-
- ! Terminate the copy with a partial store.
- ! The data should be at d0
- dec %o2 ! needed to get the mask right
- edge8n %g0, %o2, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
-
- brz,a %g1, 1f ! was fprs.fef == 0
- wr %g1, %g0, %fprs ! fprs = g1 restore fprs
-1:
- ret ! %o0 was preserved
- restore
-
-
- SET_SIZE(memset)
diff --git a/usr/src/lib/libc/sparc_hwcap1/common/gen/misc.s b/usr/src/lib/libc/sparc_hwcap1/common/gen/misc.s
deleted file mode 100644
index c2b4aa4c29..0000000000
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/misc.s
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/asm_linkage.h>
-
- ENTRY(_rock_pause)
- membar #Halt
- retl
- nop
- SET_SIZE(_rock_pause)
diff --git a/usr/src/lib/libc/sparc_hwcap1/common/gen/strcpy.s b/usr/src/lib/libc/sparc_hwcap1/common/gen/strcpy.s
deleted file mode 100644
index a9861b1a86..0000000000
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/strcpy.s
+++ /dev/null
@@ -1,340 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
- .file "strcpy.s"
-
-/*
- * strcpy(s1, s2)
- *
- * Copy string s2 to s1. s1 must be large enough. Return s1.
- *
- * Fast assembler language version of the following C-program strcpy
- * which represents the `standard' for the C-library.
- *
- * char *
- * strcpy(s1, s2)
- * register char *s1;
- * register const char *s2;
- * {
- * char *os1 = s1;
- *
- * while(*s1++ = *s2++)
- * ;
- * return(os1);
- * }
- *
- */
-
-#include <sys/asm_linkage.h>
-
-	! This implementation of strcpy works by first checking the
-	! source alignment and copying byte, halfword, or word
-	! quantities until the source ptr is aligned on an extended
-	! word boundary. Once this has occurred, the string is copied,
-	! checking for zero bytes, depending upon its dst ptr alignment.
-	! (methods for xword, word, halfword, and byte copies are present)
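In outline, that strategy reads as the following hedged C sketch — big-endian byte order as on SPARC, the source assumed already 8-byte aligned so each xword read stays within the word holding the terminator (safe on real hardware, though not strictly portable C), and the per-alignment store variants below ignored; strcpy_sketch is an illustrative name:

    #include <stdint.h>
    #include <string.h>

    static char *strcpy_sketch(char *dst, const char *src)
    {
        const uint64_t ones  = 0x0101010101010101ULL;
        const uint64_t highs = 0x8080808080808080ULL;
        char *os1 = dst;

        for (;;) {
            uint64_t x;
            memcpy(&x, src, 8);              /* one aligned xword */
            if ((x - ones) & ~x & highs)     /* this word holds the '\0' */
                break;
            memcpy(dst, &x, 8);
            src += 8;
            dst += 8;
        }
        while ((*dst++ = *src++) != '\0')    /* finish byte by byte */
            ;
        return (os1);
    }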
-
-#ifdef __sparcv9
-#define SAVESIZE (8 * 3)
-#define STACK_OFFSET (STACK_BIAS + MINFRAME)
-#else
-#define SAVESIZE (8 * 5)
-#define STACK_OFFSET (STACK_BIAS + MINFRAME + 4)
-#endif
-
-#define LABEL_ADDRESS(label, reg) \
- .pushlocals ;\
-0: rd %pc, reg ;\
- add reg, (label) - 0b, reg ;\
- .poplocals
-
-offset_table:
- .word .storexword - offset_table ! Offset 0 => xword aligned
- .word .storebyte1241 - offset_table ! Offset 1 or 5
- .word .storehalfword - offset_table ! Offset 2 or 6
- .word .storebyte1421 - offset_table ! Offset 3 or 7
- .word .storeword - offset_table ! Offset 4
-
- .align 64
-#ifdef __sparcv9
- .skip 20
-#else
- .skip 12
-#endif
-
- ENTRY(strcpy)
- add %sp, -SA(STACK_OFFSET + SAVESIZE), %sp
-#ifndef __sparcv9
- stx %g4, [%sp + STACK_OFFSET + 24]
- stx %g5, [%sp + STACK_OFFSET + 32]
-#endif
- sethi %hi(0x01010101), %o4 ! 0x01010000
- sub %o1, %o0, %o3 ! src - dst
- or %o4, %lo(0x01010101), %o4 ! 0x01010101
- andcc %o1, 7, %g5 ! dword aligned ?
- sllx %o4, 32, %o5 ! 0x01010101 << 32
- mov %o0, %o2 ! save dst
- or %o4, %o5, %o4 ! 0x0101010101010101
-
- bz,pt %ncc, .srcaligned ! yup
- sllx %o4, 7, %o5 ! 0x8080808080808080
-
- sub %g0, %g5, %g4 ! count = -off
- ldx [%o1 + %g4], %o1 ! val = *(addr + -off)
- mov -1, %g1 ! mask = -1
- sllx %g5, 3, %g4 ! shift = off * 8
- srlx %g1, %g4, %g1 ! -1 >> ((addr & 7) * 8)
- orn %o1, %g1, %o1 ! val |= ~mask
-
- andn %o5, %o1, %g4 ! ~val & 0x80
- sub %o1, %o4, %g1 ! val - 0x01
- andcc %g4, %g1, %g4 ! ~val & 0x80 & (val - 0x01)
-
- sllx %g5, 3, %g4
- add %o2, 8, %o2 ! .zerobyte expects address = address + 8
- bnz,a,pn %xcc, .zerobyte ! Zero byte in the first xword
- sllx %o1, %g4, %o1 ! and data to be left justified
-
- sub %o2, 8, %o2
- mov 8, %g4
- sub %g4, %g5, %g1 ! Bytes to be written
- sub %g1, 1, %g4
-
-1: stub %o1, [%o2 + %g4]
- dec %g4
- brgez,pt %g4, 1b
- srlx %o1, 8, %o1
-
- add %o2, %g1, %o2 ! Move ptr by #bytes written
-
-.srcaligned:
- !! Check if the first dword contains zero after src is aligned
- ldx [%o2 + %o3], %o1 ! x = src[]
- andn %o5, %o1, %g1 ! ~x & 0x8080808080808080
- sub %o1, %o4, %g4 ! x - 0x0101010101010101
- andcc %g4, %g1, %g0 ! ((x - 0x0101010101010101) & ~x & 0x8080808080808080)
- bnz,a,pn %xcc, .zerobyte ! x has zero byte, handle end cases
- add %o2, 8, %o2 ! src += 8, dst += 8
-
- !! Determine the destination offset and branch
- !! to appropriate location
- and %o2, 3, %g4
- and %o2, 4, %g1
- or %g1, %g4, %g1
- movrnz %g4, 0, %g1
- movrnz %g1, 4, %g4
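In C, the movrnz pair above amounts to the following mapping — table_index is a hypothetical name for illustration:

    /* Map a dst misalignment of 0-7 to the offset_table index:
     * 0 -> 0 (xword), 1/5 -> 1, 2/6 -> 2, 3/7 -> 3, 4 -> 4 (word). */
    static int table_index(unsigned off)
    {
        return (off & 3) ? (int)(off & 3) : (int)(off & 4);
    }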
-
- !! %g4 contains the index of the jump address
- !! Load the address from the table.
- LABEL_ADDRESS(offset_table, %g1)
- sllx %g4, 2, %g4
- lduw [%g1 + %g4], %g4
- jmp %g1 + %g4
- add %o2, 8, %o2 ! src += 8, dst += 8
-
-.storexword:
- stx %o1, [%o2 - 8] ! store word to dst (address pre-incremented)
-
-1:
- ldx [%o2 + %o3], %o1 ! src dword
- add %o2, 8, %o2 ! src += 8, dst += 8
- andn %o5, %o1, %g1 ! ~dword & 0x8080808080808080
- sub %o1, %o4, %g4 ! dword - 0x0101010101010101
- andcc %g4, %g1, %g0 ! ((dword - 0x0101010101010101) & ~dword & 0x8080808080808080)
- bz,a,pt %xcc, 1b ! no zero byte if magic expression == 0
- stx %o1, [%o2 - 8] ! store word to dst (address pre-incremented)
-
- ba,a .zerobyte
-
-.storebyte1421:
- !! Offset 3 or 7
- srlx %o1, 56, %g1 ! %g1<7:0> = first byte; word aligned now
- stb %g1, [%o2 - 8] ! store first byte
- srlx %o1, 24, %g1 ! %g1<31:0> = bytes 2, 3, 4, 5
- stw %g1, [%o2 - 7] ! store bytes 2, 3, 4, 5
- srlx %o1, 8, %g1 ! %g1<15:0> = bytes 6, 7
- sth %g1, [%o2 - 3] ! store bytes 6, 7
-
- stx %l0, [%sp + STACK_OFFSET + 0]
- and %o2, 7, %g1
- stx %l1, [%sp + STACK_OFFSET + 8]
- cmp %g1, 3
- stx %l2, [%sp + STACK_OFFSET + 16]
-
- move %ncc, 40, %l0
- move %ncc, 24, %l1
- move %ncc, -11, %l2
-
- movne %ncc, 8, %l0
- movne %ncc, 56, %l1
- movne %ncc, -15, %l2
-
- ba .dstaligned
- mov %o1, %g5
-
-.storebyte1241:
- !! Offset 1 or 5
- srlx %o1, 56, %g1 ! %g1<7:0> = first byte; word aligned now
- stb %g1, [%o2 - 8] ! store first byte
- srlx %o1, 40, %g1 ! %g1<15:0> = bytes 2, 3
- sth %g1, [%o2 - 7] ! store bytes 2, 3
- srlx %o1, 8, %g1 ! %g1<31:0> = bytes 4, 5, 6, 7
- stw %g1, [%o2 - 5] ! store bytes 4, 5, 6, 7
-
- stx %l0, [%sp + STACK_OFFSET + 0]
- and %o2, 7, %g1
- stx %l1, [%sp + STACK_OFFSET + 8]
- cmp %g1, 1
- stx %l2, [%sp + STACK_OFFSET + 16]
-
- move %ncc, 56, %l0
- move %ncc, 8, %l1
- move %ncc, -9, %l2
-
- movne %ncc, 24, %l0
- movne %ncc, 40, %l1
- movne %ncc, -13, %l2
-
- ba .dstaligned
- mov %o1, %g5
-
-.storehalfword:
- srlx %o1, 48, %g1 ! get first and second byte
- sth %g1, [%o2 - 8] ! store first and second byte; word aligned now
- srlx %o1, 16, %g1 ! %g1<31:0> = bytes 3, 4, 5, 6
- stw %g1, [%o2 - 6] ! store bytes 3, 4, 5, 6
-
- stx %l0, [%sp + STACK_OFFSET + 0]
- and %o2, 7, %g1
- stx %l1, [%sp + STACK_OFFSET + 8]
- cmp %g1, 2
- stx %l2, [%sp + STACK_OFFSET + 16]
-
- move %ncc, 48, %l0
- move %ncc, 16, %l1
- move %ncc, -10, %l2
-
- movne %ncc, 16, %l0
- movne %ncc, 48, %l1
- movne %ncc, -14, %l2
-
- ba .dstaligned
- mov %o1, %g5
-
-.storeword:
- srlx %o1, 32, %g1 ! get bytes 1,2,3,4
- stw %g1, [%o2 - 8] ! store bytes 1,2,3,4 (address is pre-incremented)
-
- stx %l0, [%sp + STACK_OFFSET + 0]
- mov 32, %l0 ! Num of bits to be shifted left
- stx %l1, [%sp + STACK_OFFSET + 8]
- mov 32, %l1 ! Num of bits to be shifted right
- stx %l2, [%sp + STACK_OFFSET + 16]
- mov -12, %l2 ! -offset
- mov %o1, %g5
-
- nop ! Do not delete. Used for alignment.
-.dstaligned:
- ldx [%o2 + %o3], %o1 ! x = src[]
- add %o2, 8, %o2 ! src += 8, dst += 8
- andn %o5, %o1, %g1 ! ~x & 0x8080808080808080
- sub %o1, %o4, %g4 ! x - 0x0101010101010101
- andcc %g4, %g1, %g0 ! ((x - 0x0101010101010101) & ~x & 0x8080808080808080)
- bnz,a,pn %xcc, .finishup ! x has zero byte, handle end cases
- stb %g5, [%o2 - 9]
-
- sllx %g5, %l0, %g5
- srlx %o1, %l1, %g4
- or %g5, %g4, %g5
-
- stx %g5, [%o2 + %l2]
- ba .dstaligned
- mov %o1, %g5
-
-.finishup:
- cmp %l0, 56
- be,pn %ncc, .zerobyte_restore
- andcc %o2, 1, %g0
- bnz,a %ncc, 1f
- srlx %g5, 8, %g5
-
-1: srlx %l1, 4, %g4 ! g4 contains 1, 2 or 3
- sub %g4, 1, %g4 ! multiple of 16
- sllx %g4, 4, %g4 ! How many bits to shift
- srlx %g5, %g4, %l0
- add %o2, %l2, %g1
-
-2: sth %l0, [%g1]
- sub %g4, 16, %g4
- add %g1, 2, %g1
- brgez,a,pt %g4, 2b
- srlx %g5, %g4, %l0
-
-.zerobyte_restore:
- ldx [%sp + STACK_OFFSET + 0], %l0
- andn %o5, %o1, %o3 ! ~val & 0x80
- ldx [%sp + STACK_OFFSET + 8], %l1
- sub %o1, %o4, %g1 ! val - 0x01
- ldx [%sp + STACK_OFFSET + 16], %l2
-
- ba 1f
- andcc %o3, %g1, %o3 ! ~val & 0x80 & (val - 0x01)
-
-.zerobyte:
- !! %o5: 0x8080808080808080
- !! %o4: 0x0101010101010101
-	!! %o1: Left-justified dword that contains the zero byte
- !! %o2: Address to be written + 8
-
- andn %o5, %o1, %o3 ! ~val & 0x80
- sub %o1, %o4, %g1 ! val - 0x01
- andcc %o3, %g1, %o3 ! ~val & 0x80 & (val - 0x01)
-
-1: srlx %o3, 7, %o3 ! shift 0x80 -> 0x01
- andn %o3, %o1, %o3 ! mask off leading 0x01 bytes
- lzd %o3, %o4 ! 7, 15, ... 63
-
- mov 64, %o5 ! Calc # of bytes to be discarded
- inc %o4 ! Include the zero byte too
- sub %o5, %o4, %o5 ! after the null byte
- sub %o2, 8, %o2 ! Adjust address which is +8 here.
- srlx %o1, %o5, %o1 ! Discard them
-
-	srlx	%o4, 3, %o4		! convert bit count to bytes to be written
-	dec	%o4			! decrement by 1 to use it as an offset
-
-2: stub %o1, [%o2 + %o4]
- dec %o4
- brgez,pt %o4, 2b
- srlx %o1, 8, %o1
-
-#ifndef __sparcv9
- ldx [%sp + STACK_OFFSET + 24], %g4
- ldx [%sp + STACK_OFFSET + 32], %g5
-#endif
- retl ! done with leaf function
- add %sp, SA(STACK_OFFSET + SAVESIZE), %sp
- SET_SIZE(strcpy)
diff --git a/usr/src/lib/libc/sparc_hwcap1/common/gen/strlen.s b/usr/src/lib/libc/sparc_hwcap1/common/gen/strlen.s
deleted file mode 100644
index d2683ef381..0000000000
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/strlen.s
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
- .file "strlen.s"
-
-/*
- * strlen(s)
- *
- * Given string s, return length (not including the terminating null).
- *
- * Fast assembler language version of the following C-program strlen
- * which represents the `standard' for the C-library.
- *
- * size_t
- * strlen(s)
- * register const char *s;
- * {
- * register const char *s0 = s + 1;
- *
- * while (*s++ != '\0')
- * ;
- * return (s - s0);
- * }
- */
-
-#include <sys/asm_linkage.h>
-
- /*
- * There are two key optimizations in the routine below.
- * First, all memory accesses are 8 bytes wide. The time
- * for long strings is dominated by the latency of load
- * instructions in the inner loop, and going 8 bytes at
- * a time means 1/8th as much latency.
- *
- * Scanning an 8 byte word for a '\0' is made fast by
- * this formula (due to Alan Mycroft):
-	 *	~x & 0x8080808080808080 & (x - 0x0101010101010101)
- * The result of this formula is non-zero iff there's
- * a '\0' somewhere in x.
- *
- * Second, the cost of short strings is dominated by the
- * cost of figuring out which byte out of the last 8
- * contained the '\0' that terminated the string. We use
- * properties of the formula above to convert scanning the
- * word for '\0' into a single LZD instruction.
- */
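In C, the test can be rendered as the following sketch (has_zero_byte is an illustrative name; note the full 8-byte 0x8080808080808080 constant):

    #include <stdint.h>

    /* Nonzero iff the 8-byte word x contains a '\0' byte (Mycroft). */
    static inline uint64_t has_zero_byte(uint64_t x)
    {
        return ((x - 0x0101010101010101ULL) &
                ~x & 0x8080808080808080ULL);
    }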
- .align 64
-	.skip	4*4			! force .strlen_findnull to align to 64 bytes
- ENTRY_NP(strlen)
- and %o0, 7, %o3 ! off = addr & 7
- sethi %hi(0x01010101), %o4 ! 0x01010000
-
- sub %g0, %o3, %o2 ! count = -off
- or %o4, %lo(0x01010101), %o4 ! 0x01010101
-
- ldx [%o0 + %o2], %o1 ! val = *(addr + count)
- sllx %o4, 32, %o5 ! 0x01010101 << 32
-
- mov -1, %g1 ! mask = -1
- sllx %o3, 3, %o3 ! shift = off * 8
-
- or %o4, %o5, %o4 ! 0x0101010101010101
- srlx %g1, %o3, %g1 ! -1 >> ((addr & 7) * 8)
-
- sllx %o4, 7, %o5 ! 0x8080808080808080
- orn %o1, %g1, %o1 ! val |= ~mask
-.strlen_findnull:
- !! %o0 - base address
- !! %o1 - xword from memory
- !! %o2 - index
- !! %o3 - result of test for '\0'
- !! %o4 - constant 0x0101.0101.0101.0101
- !! %o5 - constant 0x8080.8080.8080.8080
- !! %g1 - scratch
- andn %o5, %o1, %o3 ! ~val & 0x80
- sub %o1, %o4, %g1 ! val - 0x01
- andcc %o3, %g1, %o3 ! ~val & 0x80 & (val - 0x01)
- inc 8, %o2
- bz,a,pt %xcc, .strlen_findnull
- ldx [%o0 + %o2], %o1
-
- /*
- * The result of Mycroft's formula is a pattern of 0x80 and
- * 0x00 bytes. There's a 0x80 at every byte position where
- * there was a '\0' character, but a string of 0x01 bytes
- * immediately preceding a '\0' becomes a corresponding
- * string of 0x80 bytes. (e.g. 0x0101010101010100 becomes
- * 0x8080808080808080). We need one final step to discount
- * any leading 0x01 bytes, and then LZD can tell us how many
- * characters there were before the terminating '\0'.
- */
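In C, this final step can be sketched as follows, with __builtin_clzll standing in for LZD and assuming big-endian byte order as on SPARC; first_zero_byte_index is an illustrative name, and y is assumed nonzero (i.e. x is known to contain a '\0'):

    #include <stdint.h>

    /* Byte index (0-7) of the first '\0' in a big-endian xword x,
     * where y is Mycroft's result for x. */
    static inline int first_zero_byte_index(uint64_t x, uint64_t y)
    {
        y = (y >> 7) & ~x;               /* 0x80 -> 0x01, drop 0x01 bytes */
        return (__builtin_clzll(y) >> 3);
    }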
- !! %o1 - last data word
- !! %o2 - length+8, plus 1-8 extra
- !! %o3 - xword with 0x80 for each 0x00 byte and leading 0x01
- sub %o2, 8, %o2 ! subtract off '\0' and last 8
- srlx %o3, 7, %o3 ! shift 0x80 -> 0x01
- andn %o3, %o1, %o3 ! mask off leading 0x01 bytes
- lzd %o3, %o3 ! 7, 15, ... 63
- srlx %o3, 3, %o3 ! 0 ... 7
-
- retl
- add %o2, %o3, %o0 ! add back bytes before '\0'
-
- SET_SIZE(strlen)
diff --git a/usr/src/lib/libc/sparc_hwcap1/sparc/Makefile b/usr/src/lib/libc/sparc_hwcap1/sparc/Makefile
index 3a299a35e0..d648203adc 100644
--- a/usr/src/lib/libc/sparc_hwcap1/sparc/Makefile
+++ b/usr/src/lib/libc/sparc_hwcap1/sparc/Makefile
@@ -28,8 +28,7 @@ LIBCBASE= $(SRC)/lib/libc/sparc
LIBRARY= libc_hwcap1.a
-EXTN_CPPFLAGS= -DSMT_PAUSE_FUNCTION=_rock_pause \
- -I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include
+EXTN_CPPFLAGS= -I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include
EXTN_ASFLAGS= -xarch=v8plusd
EXTN_DYNFLAGS= -M mapfile
@@ -40,10 +39,10 @@ OPTIMIZED_LIBCBASE=../common
PRFOBJS= \
memcpy.o \
+ memmove.o \
memset.o \
strlen.o \
strcpy.o \
- misc.o
MAPFILE_AUX = mapfile-vers-aux
diff --git a/usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile b/usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile
index 7065a134f2..451d682145 100644
--- a/usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile
+++ b/usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile
@@ -27,8 +27,7 @@ LIBCBASE= $(SRC)/lib/libc/sparcv9
LIBRARY= libc_hwcap1.a
-EXTN_CPPFLAGS= -DSMT_PAUSE_FUNCTION=_rock_pause \
- -I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include
+EXTN_CPPFLAGS= -I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include
EXTN_ASFLAGS= -xarch=v9d
EXTN_DYNFLAGS= -M mapfile
@@ -39,10 +38,10 @@ OPTIMIZED_LIBCBASE=../common
PRFOBJS= \
memcpy.o \
+ memmove.o \
memset.o \
strlen.o \
strcpy.o \
- misc.o
MAPFILE_AUX = mapfile-vers-aux