path: root/usr/src/lib/libc
author    Jason Beloro <Jason.Beloro@Sun.COM>  2009-08-06 17:39:39 -0700
committer Jason Beloro <Jason.Beloro@Sun.COM>  2009-08-06 17:39:39 -0700
commit    9d0d62ad2e60e8f742a2e723d06e88352ee6a1f3 (patch)
tree      016e2a6b2f674016c46785258d0ff85e6b1bce09 /usr/src/lib/libc
parent    32a6953793c636df949ca1ae3555438159bda3f6 (diff)
download  illumos-joyent-9d0d62ad2e60e8f742a2e723d06e88352ee6a1f3.tar.gz
6858457 Remove Solaris support for UltraSPARC-AT10 processor
Diffstat (limited to 'usr/src/lib/libc')
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/common/gen/memcpy.s | 1704
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/common/gen/memset.s |  767
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/common/gen/misc.s   |   33
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/common/gen/strcpy.s |  340
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/common/gen/strlen.s |  127
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/sparc/Makefile      |    5
-rw-r--r--  usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile    |    5
7 files changed, 4 insertions, 2977 deletions
diff --git a/usr/src/lib/libc/sparc_hwcap1/common/gen/memcpy.s b/usr/src/lib/libc/sparc_hwcap1/common/gen/memcpy.s
deleted file mode 100644
index 8fdb95268f..0000000000
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/memcpy.s
+++ /dev/null
@@ -1,1704 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
- .file "memcpy.s"
-
-/*
- * memcpy(s1, s2, len)
- *
- * Copy s2 to s1, always copying n bytes.
- * Note: this C code does not work for overlapped copies;
- * memmove() and bcopy() do.
- *
- * Added entry __align_cpy_1 is generally for use of the compilers.
- *
- * Fast assembler language version of the following C-program for memcpy
- * which represents the `standard' for the C-library.
- *
- * void *
- * memcpy(void *s, const void *s0, size_t n)
- * {
- * if (n != 0) {
- * char *s1 = s;
- * const char *s2 = s0;
- * do {
- * *s1++ = *s2++;
- * } while (--n != 0);
- * }
- * return (s);
- * }
- */
-
-#include <sys/asm_linkage.h>
-#include <sys/sun4asi.h>
-#include <sys/trap.h>
-
-#ifdef __sparcv9
-#define SAVESIZE (8 * 1)
-#define STACK_OFFSET (STACK_BIAS + MINFRAME)
-#else
-#define SAVESIZE (8 * 3)
-#define STACK_OFFSET (STACK_BIAS + MINFRAME + 4)
-#endif
-
-#define scratch_offset 0
-#define g4_offset 8
-#define g5_offset 16
-
-#define ICACHE_LINE_SIZE 64
-#define BLOCK_SIZE 64
-#define FPRS_FEF 0x4
-#define PF_FAR 2048
-#define PF_NEAR 1024
-
-#define SHORTCOPY 3
-#define SMALL_MAX 39
-#define MEDIUM_MAX 255
-#define MED_WMAX 256 /* max copy for medium word-aligned case */
-#define MED_MAX 256 /* max copy for medium longword-aligned case */
-
-#ifndef BSTORE_SIZE
-#define BSTORE_SIZE 256 /* min copy size for block store */
-#endif
-
-/*
- * The LDDs will use the below ASI for performance
- * This ASI minimizes cache pollution.
- */
-#define ASI_CACHE_SPARING 0xf4
-#define ASI_CACHE_SPARING_PRIMARY 0xf4
-
- ANSI_PRAGMA_WEAK(memmove,function)
- ANSI_PRAGMA_WEAK(memcpy,function)
-
- ENTRY(memmove)
- cmp %o1, %o0 ! if from address is >= to, use forward copy
- bgeu %ncc, .forcpy ! else check whether the regions overlap
- sub %o0, %o1, %o4 ! get difference of the two addresses
- cmp %o2, %o4 ! compare size against the difference
- bleu %ncc, .forcpy ! if size fits in the gap, forward copy is safe
- nop
-
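
As a reading aid, the forward/backward dispatch above is equivalent to the
following C sketch (the byte loops are stand-ins for the .forcpy and .ovbc
paths, which the assembly of course implements with much wider moves):

    #include <stddef.h>
    #include <stdint.h>

    void *memmove_sketch(void *dst, const void *src, size_t len)
    {
        char *d = dst;
        const char *s = src;

        /* forward copy is safe when src >= dst, or when dst is at
           least len bytes past src (the regions cannot collide) */
        if ((uintptr_t)s >= (uintptr_t)d ||
            len <= (uintptr_t)d - (uintptr_t)s) {
            while (len--)
                *d++ = *s++;            /* .forcpy direction */
        } else {
            d += len;
            s += len;
            while (len--)
                *--d = *--s;            /* .ovbc backward direction */
        }
        return dst;
    }
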
- !
- ! an overlapped copy that must be done "backwards"
- !
-.ovbc:
- mov %o0, %g1 ! save dest address for return val
- add %o1, %o2, %o1 ! get to end of source space
- add %o0, %o2, %o0 ! get to end of destination space
-
- cmp %o2, 24
- bgeu,pn %ncc, .dbalign
- nop
- cmp %o2, 4
- blt,pn %ncc, .byte
- sub %o2, 3, %o2
-.byte4loop:
- ldub [%o1-1], %o3 ! load last byte
- stb %o3, [%o0-1] ! store last byte
- sub %o1, 4, %o1
- ldub [%o1+2], %o3 ! load 2nd from last byte
- stb %o3, [%o0-2] ! store 2nd from last byte
- sub %o0, 4, %o0
- ldub [%o1+1], %o3 ! load 3rd from last byte
- stb %o3, [%o0+1] ! store 3rd from last byte
- subcc %o2, 4, %o2
- ldub [%o1], %o3 ! load 4th from last byte
- bgu,pt %ncc, .byte4loop
- stb %o3, [%o0] ! store 4th from last byte
-.byte:
- addcc %o2, 3, %o2
- bz,pt %ncc, .exit
-.byteloop:
- dec %o1 ! decrement src address
- ldub [%o1], %o3 ! read a byte
- dec %o0 ! decrement dst address
- deccc %o2 ! decrement count
- bgu,pt %ncc, .byteloop ! loop until done
- stb %o3, [%o0] ! write byte
-.exit:
- retl
- mov %g1, %o0
-
- .align 16
-.dbalign:
- andcc %o0, 7, %o5 ! bytes till DST 8 byte aligned
- bz,pt %ncc, .dbmed
- sub %o2, %o5, %o2 ! update count
-.dbalign1:
- dec %o1 ! decrement src address
- ldub [%o1], %o3 ! read a byte
- dec %o0 ! decrement dst address
- deccc %o5 ! decrement count
- bgu,pt %ncc, .dbalign1 ! loop until done
- stb %o3, [%o0] ! store a byte
-
-! check for src long word alignment
-.dbmed:
- mov %asi, %g5 ! save curr %asi
- wr %g0, ASI_CACHE_SPARING, %asi
- andcc %o1, 7, %g0 ! chk src long word alignment
- bnz,pn %ncc, .dbbck
- nop
-!
-! Following code is for overlapping copies where src and dest
-! are long word aligned
-!
- cmp %o2, 4095
- blt,pn %ncc, .dbmedl32enter ! go to no prefetch code
- nop
- prefetch [%o1 - (1 * BLOCK_SIZE)], #n_reads
- sub %o2, 63, %o2 ! adjust length to allow cc test
- ! for end of loop
- prefetch [%o1 - (2 * BLOCK_SIZE)], #n_reads
- prefetch [%o1 - (3 * BLOCK_SIZE)], #n_reads
- prefetch [%o1 - (4 * BLOCK_SIZE)], #n_reads
-.dbmedl64:
- prefetch [%o1 - (5 * BLOCK_SIZE)], #n_reads
- ldxa [%o1-8]%asi, %o3 ! load
- subcc %o2, 64, %o2 ! decrement length count
- stx %o3, [%o0-8] ! and store
- ldxa [%o1-16]%asi, %o3 ! a block of 64 bytes
- sub %o1, 64, %o1 ! decrease src ptr by 64
- stx %o3, [%o0-16]
- sub %o0, 64, %o0 ! decrease dst ptr by 64
- ldxa [%o1+40]%asi, %o3
- ldxa [%o1+32]%asi, %o4
- ldxa [%o1+24]%asi, %o5
- stx %o3, [%o0+40]
- stx %o4, [%o0+32]
- stx %o5, [%o0+24]
- ldxa [%o1+16]%asi, %o3
- ldxa [%o1+8]%asi, %o4
- stx %o3, [%o0+16]
- stx %o4, [%o0+8]
- ldxa [%o1]%asi, %o5
- bgu,pt %ncc, .dbmedl64 ! repeat if at least 64 bytes left
- stx %o5, [%o0]
- add %o2, 63, %o2 ! restore offset adjustment
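
The -63/+63 bias around the loop above lets the subcc that decrements the
count double as the "at least 64 bytes remain" test, saving a separate
compare. A minimal C sketch of what the biased count computes
(copy_block64 is a hypothetical stand-in for one unrolled 64-byte
backward chunk):

    size_t n = len;
    while (n >= 64) {         /* folded into subcc %o2, 64 / bgu */
        s -= 64;
        d -= 64;
        copy_block64(d, s);   /* eight 8-byte loads and stores   */
        n -= 64;
    }
    /* 0..63 bytes remain in n for the fixup code that follows */
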
-.dbmedl32enter:
- subcc %o2, 31, %o2 ! adjust length to allow cc test
- ! for end of loop
- ble,pt %ncc, .dbmedl31 ! skip big loop if less than 32
- nop
-.dbmedl32:
- ldx [%o1-8], %o4 ! load
- subcc %o2, 32, %o2 ! decrement length count
- stx %o4, [%o0-8] ! and store
- ldx [%o1-16], %o3 ! a block of 32 bytes
- sub %o1, 32, %o1 ! decrease src ptr by 32
- stx %o3, [%o0-16]
- ldx [%o1+8], %o4
- sub %o0, 32, %o0 ! decrease dst ptr by 32
- stx %o4, [%o0+8]
- ldx [%o1], %o3
- bgu,pt %ncc, .dbmedl32 ! repeat if at least 32 bytes left
- stx %o3, [%o0]
-.dbmedl31:
- addcc %o2, 16, %o2 ! adjust remaining count
- ble,pt %ncc, .dbmedl15 ! skip if 15 or fewer bytes left
- nop !
- ldx [%o1-8], %o4 ! load and store 16 bytes
- sub %o1, 16, %o1 ! decrease src ptr by 16
- stx %o4, [%o0-8] !
- sub %o2, 16, %o2 ! decrease count by 16
- ldx [%o1], %o3 !
- sub %o0, 16, %o0 ! decrease dst ptr by 16
- stx %o3, [%o0]
-.dbmedl15:
- addcc %o2, 15, %o2 ! restore count
- bz,pt %ncc, .dbexit ! exit if finished
- nop
- cmp %o2, 8
- blt,pt %ncc, .dbremain ! skip if 7 or fewer bytes left
- nop
- ldx [%o1-8], %o4 ! load 8 bytes
- sub %o1, 8, %o1 ! decrease src ptr by 8
- stx %o4, [%o0-8] ! and store 8 bytes
- subcc %o2, 8, %o2 ! decrease count by 8
- bnz %ncc, .dbremain ! exit if finished
- sub %o0, 8, %o0 ! decrease dst ptr by 8
- mov %g5, %asi ! restore %asi
- retl
- mov %g1, %o0
-
-!
-! Following code is for overlapping copies where src and dest
-! are not long word aligned
-!
- .align 16
-.dbbck:
- rd %fprs, %o3 ! o3 = fprs
-
- ! if fprs.fef == 0, set it. Checking it requires 2 instructions,
- ! so set it anyway, without checking.
- wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
-
- alignaddr %o1, %g0, %o5 ! align src
- ldda [%o5]%asi, %d0 ! get first 8 byte block
- andn %o2, 7, %o4 ! prepare src ptr for finishup code
- cmp %o2, 32
- blt,pn %ncc, .dbmv8
- sub %o1, %o4, %o1 !
- cmp %o2, 4095 ! check for short memmoves
- blt,pn %ncc, .dbmv32enter ! go to no prefetch code
-.dbmv64:
- ldda [%o5-8]%asi, %d2 ! load 8 bytes
- ldda [%o5-16]%asi, %d4 ! load 8 bytes
- sub %o5, 64, %o5 !
- ldda [%o5+40]%asi, %d6 ! load 8 bytes
- sub %o0, 64, %o0 !
- ldda [%o5+32]%asi, %d8 ! load 8 bytes
- sub %o2, 64, %o2 ! 64 less bytes to copy
- ldda [%o5+24]%asi, %d18 ! load 8 bytes
- cmp %o2, 64 ! do we have < 64 bytes remaining
- ldda [%o5+16]%asi, %d28 ! load 8 bytes
- ldda [%o5+8]%asi, %d30 ! load 8 bytes
- prefetch [%o5 - (5 * BLOCK_SIZE)], #n_reads
- faligndata %d2, %d0, %d10 ! extract 8 bytes out
- ldda [%o5]%asi, %d0 ! load 8 bytes
- std %d10, [%o0+56] ! store the current 8 bytes
- faligndata %d4, %d2, %d12 ! extract 8 bytes out
- std %d12, [%o0+48] ! store the current 8 bytes
- faligndata %d6, %d4, %d14 ! extract 8 bytes out
- std %d14, [%o0+40] ! store the current 8 bytes
- faligndata %d8, %d6, %d16 ! extract 8 bytes out
- std %d16, [%o0+32] ! store the current 8 bytes
- faligndata %d18, %d8, %d20 ! extract 8 bytes out
- std %d20, [%o0+24] ! store the current 8 bytes
- faligndata %d28, %d18, %d22 ! extract 8 bytes out
- std %d22, [%o0+16] ! store the current 8 bytes
- faligndata %d30, %d28, %d24 ! extract 8 bytes out
- std %d24, [%o0+8] ! store the current 8 bytes
- faligndata %d0, %d30, %d26 ! extract 8 bytes out
- bgeu,pt %ncc, .dbmv64
- std %d26, [%o0] ! store the current 8 bytes
-
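
The faligndata sequence above implements the misaligned copy in software:
GSR.ALIGN holds the byte offset set earlier by alignaddr, and each
faligndata extracts 8 output bytes from a 16-byte window of two source
doublewords. A hedged C model of the instruction on big-endian SPARC
(off is the GSR.ALIGN value, 0..7):

    #include <stdint.h>

    static uint64_t faligndata_model(uint64_t hi, uint64_t lo, unsigned off)
    {
        if (off == 0)
            return hi;                       /* avoid shifting by 64 */
        return (hi << (8 * off)) | (lo >> (8 * (8 - off)));
    }
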
- cmp %o2, 32
- blt,pn %ncc, .dbmvx
- nop
-.dbmv32:
- ldda [%o5-8]%asi, %d2 ! load 8 bytes
-.dbmv32enter:
- ldda [%o5-16]%asi, %d4 ! load 8 bytes
- sub %o5, 32, %o5 !
- ldda [%o5+8]%asi, %d6 ! load 8 bytes
- sub %o0, 32, %o0 !
- faligndata %d2, %d0, %d10 ! extract 8 bytes out
- ldda [%o5]%asi, %d0 ! load 8 bytes
- sub %o2,32, %o2 ! 32 less bytes to copy
- std %d10, [%o0+24] ! store the current 8 bytes
- cmp %o2, 32 ! do we have < 32 bytes remaining
- faligndata %d4, %d2, %d12 ! extract 8 bytes out
- std %d12, [%o0+16] ! store the current 8 bytes
- faligndata %d6, %d4, %d14 ! extract 8 bytes out
- std %d14, [%o0+8] ! store the current 8 bytes
- faligndata %d0, %d6, %d16 ! extract 8 bytes out
- bgeu,pt %ncc, .dbmv32
- std %d16, [%o0] ! store the current 8 bytes
-.dbmvx:
- cmp %o2, 8 ! do we have < 8 bytes remaining
- blt,pt %ncc, .dbmvfinish ! if yes, skip to finish up code
- nop
-.dbmv8:
- ldda [%o5-8]%asi, %d2
- sub %o0, 8, %o0 ! since we are at the end
- ! when we first enter the loop
- sub %o2, 8, %o2 ! 8 less bytes to copy
- sub %o5, 8, %o5
- cmp %o2, 8 ! do we have < 8 bytes remaining
- faligndata %d2, %d0, %d8 ! extract 8 bytes out
- std %d8, [%o0] ! store the current 8 bytes
- bgeu,pt %ncc, .dbmv8
- fmovd %d2, %d0
-.dbmvfinish:
- and %o3, 0x4, %o3 ! fprs.du = fprs.dl = 0
- tst %o2
- bz,pt %ncc, .dbexit
- wr %o3, %g0, %fprs ! fprs = o3 restore fprs
-
-.dbremain:
- cmp %o2, 4
- blt,pn %ncc, .dbbyte
- nop
- ldub [%o1-1], %o3 ! load last byte
- stb %o3, [%o0-1] ! store last byte
- sub %o1, 4, %o1
- ldub [%o1+2], %o3 ! load 2nd from last byte
- stb %o3, [%o0-2] ! store 2nd from last byte
- sub %o0, 4, %o0
- ldub [%o1+1], %o3 ! load 3rd from last byte
- stb %o3, [%o0+1] ! store 3rd from last byte
- subcc %o2, 4, %o2
- ldub [%o1], %o3 ! load 4th from last byte
- stb %o3, [%o0] ! store 4th from last byte
- bz,pt %ncc, .dbexit
-.dbbyte:
- dec %o1 ! decrement src address
- ldub [%o1], %o3 ! read a byte
- dec %o0 ! decrement dst address
- deccc %o2 ! decrement count
- bgu,pt %ncc, .dbbyte ! loop until done
- stb %o3, [%o0] ! write byte
-.dbexit:
- mov %g5, %asi ! restore %asi
- retl
- mov %g1, %o0
- SET_SIZE(memmove)
-
- .align ICACHE_LINE_SIZE
- ENTRY(memcpy)
- ENTRY(__align_cpy_1)
- ! adjust instruction alignment
- nop ! Do not remove, these nops affect
- nop ! icache alignment and performance
-.forcpy:
- cmp %o2, SMALL_MAX ! check for not small case
- bgu,pn %ncc, .medium ! go to larger cases
- mov %o0, %g1 ! save %o0
- cmp %o2, SHORTCOPY ! check for really short case
- ble,pt %ncc, .smallleft !
- or %o0, %o1, %o3 ! prepare alignment check
- andcc %o3, 0x3, %g0 ! test for alignment
- bz,pt %ncc, .smallword ! branch to word aligned case
- sub %o2, 3, %o2 ! adjust count to allow cc zero test
-.smallnotalign4:
- ldub [%o1], %o3 ! read byte
- subcc %o2, 4, %o2 ! reduce count by 4
- stb %o3, [%o0] ! write byte
- ldub [%o1+1], %o3 ! repeat for a total of 4 bytes
- add %o1, 4, %o1 ! advance SRC by 4
- stb %o3, [%o0+1]
- ldub [%o1-2], %o3
- add %o0, 4, %o0 ! advance DST by 4
- stb %o3, [%o0-2]
- ldub [%o1-1], %o3
- bgu,pt %ncc, .smallnotalign4 ! loop until 3 or fewer bytes remain
- stb %o3, [%o0-1]
- add %o2, 3, %o2 ! restore count
-.smallleft:
- tst %o2
- bz,pt %ncc, .smallexit
- nop
-.smallleft3: ! 1, 2, or 3 bytes remain
- ldub [%o1], %o3 ! load one byte
- deccc %o2 ! reduce count for cc test
- bz,pt %ncc, .smallexit
- stb %o3, [%o0] ! store one byte
- ldub [%o1+1], %o3 ! load second byte
- deccc %o2
- bz,pt %ncc, .smallexit
- stb %o3, [%o0+1] ! store second byte
- ldub [%o1+2], %o3 ! load third byte
- stb %o3, [%o0+2] ! store third byte
- retl
- mov %g1, %o0 ! restore %o0
-
- .align 16
- nop ! affects loop icache alignment
-.smallwords:
- lduw [%o1], %o3 ! read word
-.smallwordx:
- subcc %o2, 8, %o2 ! update count
- stw %o3, [%o0] ! write word
- add %o1, 8, %o1 ! update SRC
- lduw [%o1-4], %o3 ! read word
- add %o0, 8, %o0 ! update DST
- bgu,pt %ncc, .smallwords ! loop until done
- stw %o3, [%o0-4] ! write word
- addcc %o2, 7, %o2 ! restore count
- bz,pt %ncc, .smallexit ! check for completion
- nop
- cmp %o2, 4 ! check for 4 or more bytes left
- blt .smallleft3 ! if not, go to finish up
- nop
- lduw [%o1], %o3
- add %o1, 4, %o1
- subcc %o2, 4, %o2
- stw %o3, [%o0]
- add %o0, 4, %o0
- bnz,pt %ncc, .smallleft3
- nop
- retl
- mov %g1, %o0 ! restore %o0
-
-.smallword:
- subcc %o2, 4, %o2 ! update count
- bgu,pt %ncc, .smallwordx
- lduw [%o1], %o3 ! read word
- addcc %o2, 3, %o2 ! restore count
- bz,pt %ncc, .smallexit
- stw %o3, [%o0] ! write word
- deccc %o2 ! reduce count for cc test
- ldub [%o1+4], %o3 ! load one byte
- bz,pt %ncc, .smallexit
- stb %o3, [%o0+4] ! store one byte
- ldub [%o1+5], %o3 ! load second byte
- deccc %o2
- bz,pt %ncc, .smallexit
- stb %o3, [%o0+5] ! store second byte
- ldub [%o1+6], %o3 ! load third byte
- stb %o3, [%o0+6] ! store third byte
-.smallexit:
- retl
- mov %g1, %o0 ! restore %o0
- .align 16
-.medium:
- neg %o0, %o5
- neg %o1, %o3
- andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned
- and %o3, 7, %o3 ! bytes till SRC 8 byte aligned
- cmp %o5, %o3
- bne %ncc, continue
- sub %o5, %o3, %o3 ! -(bytes till SRC aligned after DST aligned)
- ! o3={-7, -6, ... 7} o3>0 => SRC overaligned
- ! src and dst are aligned.
- mov %o3, %g5 ! save %o3
- andcc %o1, 7, %o3 ! is src buf aligned on an 8 byte boundary
- brz,pt %o3, src_dst_aligned_on_8
- mov %o3, %o5
- mov 8, %o4
- sub %o4, %o3, %o3
- cmp %o3, %o2
- bg,a,pn %ncc, 1f
- mov %o2, %o3
-1:
- ! %o3 has the bytes to be written in partial store.
- sub %o2, %o3, %o2
- prefetch [%o1],2
-
-7:
- deccc %o3 ! byte clearing loop
- ldub [%o1], %o4 ! load one byte
- stb %o4, [%o0]
- inc %o1 ! increment src
- bgu,pt %ncc, 7b
- inc %o0 ! increment dst
-
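
A C sketch of the head-alignment step just performed, assuming s, d, and
len mirror %o1, %o0, and %o2:

    size_t head = (8 - ((uintptr_t)s & 7)) & 7;  /* bytes to 8-byte align */
    if (head > len)
        head = len;                  /* clamp: don't overrun short copies */
    len -= head;
    while (head--)
        *d++ = *s++;                 /* the byte loop at 7: above */
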
- mov %g5, %o3 ! restore %o3
-src_dst_aligned_on_8:
- ! check if we are copying 1k or more bytes
- cmp %o2, 511
- bgu,pt %ncc, copying_ge_512
- nop
- ba .medlword
- nop
-
-continue:
- andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned
- bz %ncc, 2f
- nop
-
- sub %o2, %o5, %o2 ! update count
-
-1:
- ldub [%o1], %o4
- deccc %o5
- inc %o1
- stb %o4, [%o0]
- bgu,pt %ncc, 1b
- inc %o0
-
- ! Now DST is 8-byte aligned. o0, o1, o2 are current.
-
-2:
- andcc %o1, 0x3, %g0 ! test alignment
- bnz,pt %ncc, .mediumsetup ! branch to skip aligned cases
- ! if src, dst not aligned
- prefetch [%o1 + (1 * BLOCK_SIZE)], #n_reads
-
-/*
- * Handle all cases where src and dest are aligned on word
- * or long word boundaries. Use unrolled loops for better
- * performance. This option wins over standard large data
- * move when source and destination is in cache for medium
- * to short data moves.
- */
- andcc %o1, 0x7, %g0 ! test word alignment
- bz,pt %ncc, src_dst_lword_aligned ! branch to long word aligned case
- prefetch [%o1 + (2 * BLOCK_SIZE)], #n_reads
- cmp %o2, MED_WMAX ! limit to store buffer size
- bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop
- nop
- subcc %o2, 15, %o2 ! adjust length to allow cc test
- ! for end of loop
- ble,pt %ncc, .medw15 ! skip big loop if less than 16
- prefetch [%o1 + (3 * BLOCK_SIZE)], #n_reads
-/*
- * no need to put prefetch in loop as prefetches have
- * already been issued for maximum loop size
- */
-.medw16:
- ld [%o1], %o4 ! load
- subcc %o2, 16, %o2 ! decrement length count
- stw %o4, [%o0] ! and store
- ld [%o1+4], %o3 ! a block of 16 bytes
- add %o1, 16, %o1 ! increase src ptr by 16
- stw %o3, [%o0+4]
- ld [%o1-8], %o4
- add %o0, 16, %o0 ! increase dst ptr by 16
- stw %o4, [%o0-8]
- ld [%o1-4], %o3
- bgu,pt %ncc, .medw16 ! repeat if at least 16 bytes left
- stw %o3, [%o0-4]
-.medw15:
- addcc %o2, 15, %o2 ! restore count
- bz,pt %ncc, .medwexit ! exit if finished
- nop
- cmp %o2, 8
- blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left
- nop !
- ld [%o1], %o4 ! load 4 bytes
- subcc %o2, 8, %o2 ! decrease count by 8
- stw %o4, [%o0] ! and store 4 bytes
- add %o1, 8, %o1 ! increase src ptr by 8
- ld [%o1-4], %o3 ! load 4 bytes
- add %o0, 8, %o0 ! increase dst ptr by 8
- stw %o3, [%o0-4] ! and store 4 bytes
- bz %ncc, .medwexit ! exit if finished
- nop
-.medw7: ! count is ge 1, less than 8
- cmp %o2, 3 ! check for 4 bytes left
- ble,pt %ncc, .medw3 ! skip if 3 or fewer bytes left
- nop !
- ld [%o1], %o4 ! load 4 bytes
- sub %o2, 4, %o2 ! decrease count by 4
- add %o1, 4, %o1 ! increase src ptr by 4
- stw %o4, [%o0] ! and store 4 bytes
- add %o0, 4, %o0 ! increase dst ptr by 4
- tst %o2 ! check for zero bytes left
- bz %ncc, .medwexit ! exit if finished
- nop
-.medw3: ! count is known to be 1, 2, or 3
- deccc %o2 ! reduce count by one
- ldub [%o1], %o3 ! load one byte
- bz,pt %ncc, .medwexit ! exit if last byte
- stb %o3, [%o0] ! store one byte
- ldub [%o1+1], %o3 ! load second byte
- deccc %o2 ! reduce count by one
- bz,pt %ncc, .medwexit ! exit if last byte
- stb %o3, [%o0+1] ! store second byte
- ldub [%o1+2], %o3 ! load third byte
- stb %o3, [%o0+2] ! store third byte
-.medwexit:
- retl
- mov %g1, %o0 ! restore %o0
-
-/*
- * Special case for handling when src and dest are both long word aligned
- * and total data to move is between SMALL_MAX and MED_MAX bytes
- */
-
- .align 16
- nop
-src_dst_lword_aligned:
-.medlword: ! long word aligned
- cmp %o2, MED_MAX ! limit to store buffer size
- bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop
- nop
- subcc %o2, 31, %o2 ! adjust length to allow cc test
- ! for end of loop
- ble,pt %ncc, .medl31 ! skip big loop if less than 32
- prefetch [%o1 + (3 * BLOCK_SIZE)], #n_reads ! into the l2 cache
-/*
- * no need to put prefetch in loop as prefetches have
- * already been issued for maximum loop size
- */
-.medl32:
- ldx [%o1], %o4 ! load
- subcc %o2, 32, %o2 ! decrement length count
- stx %o4, [%o0] ! and store
- ldx [%o1+8], %o3 ! a block of 32 bytes
- add %o1, 32, %o1 ! increase src ptr by 32
- stx %o3, [%o0+8]
- ldx [%o1-16], %o4
- add %o0, 32, %o0 ! increase dst ptr by 32
- stx %o4, [%o0-16]
- ldx [%o1-8], %o3
- bgu,pt %ncc, .medl32 ! repeat if at least 32 bytes left
- stx %o3, [%o0-8]
-.medl31:
- addcc %o2, 16, %o2 ! adjust remaining count
- ble,pt %ncc, .medl15 ! skip if 15 or fewer bytes left
- nop !
- ldx [%o1], %o4 ! load and store 16 bytes
- add %o1, 16, %o1 ! increase src ptr by 16
- stx %o4, [%o0] !
- sub %o2, 16, %o2 ! decrease count by 16
- ldx [%o1-8], %o3 !
- add %o0, 16, %o0 ! increase dst ptr by 16
- stx %o3, [%o0-8]
-.medl15:
- addcc %o2, 15, %o2 ! restore count
- bz,pt %ncc, .medwexit ! exit if finished
- nop
- cmp %o2, 8
- blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left
- nop
- ldx [%o1], %o4 ! load 8 bytes
- add %o1, 8, %o1 ! increase src ptr by 8
- stx %o4, [%o0] ! and store 8 bytes
- subcc %o2, 8, %o2 ! decrease count by 8
- bz %ncc, .medwexit ! exit if finished
- add %o0, 8, %o0 ! increase dst ptr by 8
- ba .medw7
- nop
-
- .align 16
- nop
- nop
- nop
-unaligned_src_dst:
-
-.mediumsetup:
- prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
-.mediumrejoin:
- rd %fprs, %o4 ! check for unused fp
-
- add %o1, 8, %o1 ! prepare to round SRC upward
-
- sethi %hi(0x1234567f), %o5 ! For GSR.MASK
- or %o5, 0x67f, %o5
- andcc %o4, FPRS_FEF, %o4 ! test FEF, fprs.du = fprs.dl = 0
- bz,a %ncc, 3f
- wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
-3:
- cmp %o2, MEDIUM_MAX
- bmask %o5, %g0, %g0
-
- ! Compute o5 (number of bytes that need copying using the main loop).
- ! First, compute for the medium case.
- ! Then, if large case, o5 is replaced by count for block alignment.
- ! Be careful not to read past end of SRC
- ! Currently, o2 is the actual count remaining
- ! o3 is how much sooner we'll cross the alignment boundary
- ! in SRC compared to in DST
- !
- ! Examples: Let # denote bytes that should not be accessed
- ! Let x denote a byte already copied to align DST
- ! Let . and - denote bytes not yet copied
- ! Let | denote double alignment boundaries
- !
- ! DST: ######xx|........|--------|..###### o2 = 18
- ! o0
- !
- ! o3 = -3: SRC: ###xx...|.....---|-----..#|######## o5 = 8
- ! o1
- !
- ! o3 = 0: SRC: ######xx|........|--------|..###### o5 = 16-8 = 8
- ! o1
- !
- ! o3 = +1: SRC: #######x|x.......|.-------|-..##### o5 = 16-8 = 8
- ! o1
-
- mov %asi, %g5 ! save curr %asi
- wr %g0, ASI_CACHE_SPARING, %asi
-
- or %g0, -8, %o5
- alignaddr %o1, %g0, %o1 ! set GSR.ALIGN and align o1
-
- movrlz %o3, %g0, %o5 ! subtract 8 from o2+o3 only if o3>=0
- add %o5, %o2, %o5
- add %o5, %o3, %o5
-
- bleu %ncc, 4f
- andn %o5, 7, %o5 ! 8 byte aligned count
- neg %o0, %o5 ! 'large' case
- and %o5, BLOCK_SIZE-1, %o5 ! bytes till DST block aligned
-4:
- brgez,a %o3, .beginmedloop
- ldda [%o1-8]%asi, %d0
-
- add %o1, %o3, %o1 ! back up o1
-5:
- ldda [%o1]ASI_FL8_P, %d2
- inc %o1
- andcc %o1, 7, %g0
- bnz %ncc, 5b
- bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2
-
-.beginmedloop:
- tst %o5
- bz %ncc, .endmedloop
- sub %o2, %o5, %o2 ! update count for later
-
- ! Main loop to write out doubles. Note: o5 & 7 == 0
-
- ldd [%o1], %d2
- subcc %o5, 8, %o5 ! update local count
- bz,pn %ncc, 1f
- add %o1, 8, %o1 ! update SRC
-
-.medloop:
- faligndata %d0, %d2, %d4
- ldda [%o1]%asi, %d0
- subcc %o5, 8, %o5 ! update local count
- add %o1, 16, %o1 ! update SRC
- std %d4, [%o0]
- bz,pn %ncc, 2f
- faligndata %d2, %d0, %d6
- ldda [%o1 - 8]%asi, %d2
- subcc %o5, 8, %o5 ! update local count
- std %d6, [%o0 + 8]
- bnz,pt %ncc, .medloop
- add %o0, 16, %o0 ! update DST
-
-1:
- faligndata %d0, %d2, %d4
- fmovd %d2, %d0
- std %d4, [%o0]
- ba .endmedloop
- add %o0, 8, %o0
-
-2:
- std %d6, [%o0 + 8]
- sub %o1, 8, %o1
- add %o0, 16, %o0
-
-
-.endmedloop:
- ! Currently, o1 is pointing to the next double-aligned byte in SRC
- ! The 8 bytes starting at [o1-8] are available in d0
- ! At least one, and possibly all, of these need to be written.
-
- cmp %o2, BLOCK_SIZE
- bgu %ncc, .large ! otherwise, less than 16 bytes left
-
-#if 1
-
- /* This code will use partial stores. */
-
- mov %g0, %o5
- and %o3, 7, %o3 ! Number of bytes needed to completely
- ! fill %d0 with good (unwritten) data.
-
- subcc %o2, 8, %o2 ! update count (maybe too much)
- movl %ncc, %o2, %o5
- addcc %o3, %o5, %o5 ! extra bytes we can stuff into %d0
- sub %o3, %o5, %o3 ! update o3 (# bad bytes in %d0)
-
- bz %ncc, 2f
- alignaddr %o3, %g0, %g0 ! set GSR.ALIGN
-
-1:
- deccc %o5
- ldda [%o1]ASI_FL8_P, %d2
- inc %o1
- bgu %ncc, 1b
- bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2
-
-2:
- not %o3
- faligndata %d0, %d0, %d0 ! shift bytes to the left
- and %o3, 7, %o3 ! last byte to be stored in [%o0+%o3]
- edge8n %g0, %o3, %o5
- stda %d0, [%o0]%o5, ASI_PST8_P
- brlez %o2, .exit_memcpy
- add %o0, %o3, %o0 ! update DST to last stored byte
-3:
- inc %o0
- deccc %o2
- ldub [%o1], %o3
- stb %o3, [%o0]
- bgu %ncc, 3b
- inc %o1
-
-#else
-
- andcc %o3, 7, %o5 ! Number of bytes needed to completely
- ! fill %d0 with good (unwritten) data.
- bz %ncc, 2f
- sub %o5, 8, %o3 ! -(number of good bytes in %d0)
- cmp %o2, 8
- bl,a %ncc, 3f ! Not enough bytes to fill %d0
- add %o1, %o3, %o1 ! Back up %o1
-
-1:
- deccc %o5
- ldda [%o1]ASI_FL8_P, %d2
- inc %o1
- bgu %ncc, 1b
- bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2
-
-2:
- subcc %o2, 8, %o2
- std %d0, [%o0]
- bz %ncc, .exit_memcpy
- add %o0, 8, %o0
-3:
- ldub [%o1], %o3
- deccc %o2
- inc %o1
- stb %o3, [%o0]
- bgu %ncc, 3b
- inc %o0
-#endif
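
The enabled variant above finishes with a byte-masked 8-byte store: edge8n
generates a byte mask and stda ...ASI_PST8_P writes only the selected
bytes. A hedged C model of the masked store, assuming edge8n(%g0, last)
yields a mask covering bytes 0..last and that bit (7 - i) of the mask
guards byte i (big-endian bit order):

    #include <stdint.h>

    static void pst8_model(uint8_t *dst, const uint8_t src[8], uint8_t mask)
    {
        for (int i = 0; i < 8; i++)
            if (mask & (0x80u >> i))
                dst[i] = src[i];       /* unselected bytes are untouched */
    }
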
-
-.exit_memcpy:
- wr %o4, %g0, %fprs ! fprs = o4 restore fprs
- mov %g5, %asi ! restore %asi
- retl
- mov %g1, %o0
-
- .align ICACHE_LINE_SIZE
-.large:
- ! The following test for BSTORE_SIZE is used to decide whether
- ! to store data with a block store or with individual stores.
- ! The block store wins when the amount of data is so large
- ! that it causes other application data to be moved out
- ! of the L1 or L2 cache.
- ! On a Panther, block store can lose more often because block
- ! store forces the stored data to be removed from the L3 cache.
- !
- sethi %hi(BSTORE_SIZE),%o5
- or %o5,%lo(BSTORE_SIZE),%o5
- cmp %o2, %o5
- bgu %ncc, .xlarge
-
- ! %o0 I/O DST is 64-byte aligned
- ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
- ! %d0 I/O already loaded with SRC data from [%o1-8]
- ! %o2 I/O count (number of bytes that need to be written)
- ! %o3 I Not written. If zero, then SRC is double aligned.
- ! %o4 I Not written. Holds fprs.
- ! %o5 O The number of doubles that remain to be written.
-
- ! Load the rest of the current block
- ! Recall that %o1 is further into SRC than %o0 is into DST
-
- prefetch [%o0 + (0 * BLOCK_SIZE)], #n_writes
- prefetch [%o0 + (1 * BLOCK_SIZE)], #n_writes
- prefetch [%o0 + (2 * BLOCK_SIZE)], #n_writes
- ldda [%o1]%asi, %d2
- prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
- ldda [%o1 + 0x8]%asi, %d4
- faligndata %d0, %d2, %d16
- ldda [%o1 + 0x10]%asi, %d6
- faligndata %d2, %d4, %d18
- ldda [%o1 + 0x18]%asi, %d8
- faligndata %d4, %d6, %d20
- ldda [%o1 + 0x20]%asi, %d10
- or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8
- prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
- faligndata %d6, %d8, %d22
- ldda [%o1 + 0x28]%asi, %d12
- movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later)
- faligndata %d8, %d10, %d24
- ldda [%o1 + 0x30]%asi, %d14
- faligndata %d10, %d12, %d26
- ldda [%o1 + 0x38]%asi, %d0
- sub %o2, BLOCK_SIZE, %o2 ! update count
- prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
- add %o1, BLOCK_SIZE, %o1 ! update SRC
-
- ! Main loop. Write previous block. Load rest of current block.
- ! Some bytes will be loaded that won't yet be written.
-1:
- ldda [%o1]%asi, %d2
- faligndata %d12, %d14, %d28
- ldda [%o1 + 0x8]%asi, %d4
- faligndata %d14, %d0, %d30
- std %d16, [%o0]
- std %d18, [%o0+8]
- std %d20, [%o0+16]
- std %d22, [%o0+24]
- std %d24, [%o0+32]
- std %d26, [%o0+40]
- std %d28, [%o0+48]
- std %d30, [%o0+56]
- sub %o2, BLOCK_SIZE, %o2 ! update count
- prefetch [%o0 + (6 * BLOCK_SIZE)], #n_writes
- prefetch [%o0 + (3 * BLOCK_SIZE)], #n_writes
- add %o0, BLOCK_SIZE, %o0 ! update DST
- ldda [%o1 + 0x10]%asi, %d6
- faligndata %d0, %d2, %d16
- ldda [%o1 + 0x18]%asi, %d8
- faligndata %d2, %d4, %d18
- ldda [%o1 + 0x20]%asi, %d10
- faligndata %d4, %d6, %d20
- ldda [%o1 + 0x28]%asi, %d12
- faligndata %d6, %d8, %d22
- ldda [%o1 + 0x30]%asi, %d14
- faligndata %d8, %d10, %d24
- ldda [%o1 + 0x38]%asi, %d0
- faligndata %d10, %d12, %d26
- cmp %o2, BLOCK_SIZE + 8
- prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
- bgu,pt %ncc, 1b
- add %o1, BLOCK_SIZE, %o1 ! update SRC
- faligndata %d12, %d14, %d28
- faligndata %d14, %d0, %d30
- stda %d16, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache
- cmp %o2, BLOCK_SIZE
- bne %ncc, 2f ! exactly 1 block remaining?
- add %o0, BLOCK_SIZE, %o0 ! update DST
- brz,a %o3, 3f ! is SRC double aligned?
- ldd [%o1], %d2
-
-2:
- add %o5, %o2, %o5 ! %o5 was already set to 0 or -8
- add %o5, %o3, %o5
-
- ba .beginmedloop
- andn %o5, 7, %o5 ! 8 byte aligned count
-
- ! This is when there is exactly 1 block remaining and SRC is aligned
-3:
- ! %d0 was loaded in the last iteration of the loop above, and
- ! %d2 was loaded in the branch delay slot that got us here.
- ldd [%o1 + 0x08], %d4
- ldd [%o1 + 0x10], %d6
- ldd [%o1 + 0x18], %d8
- ldd [%o1 + 0x20], %d10
- ldd [%o1 + 0x28], %d12
- ldd [%o1 + 0x30], %d14
- stda %d0, [%o0]ASI_BLK_P
-
- ba .exit_memcpy
- nop
-
-
- .align 16
- ! two nops here cause the loop starting at 1f below to fall
- ! on a cache line boundary, improving performance
- nop
- nop
-xlarge:
-.xlarge:
- /*
- set 4096, %l2
- subcc %o2, %l2, %g0
- bge %ncc, size_ge_4k
- nop
- */
- ! %o0 I/O DST is 64-byte aligned
- ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
- ! %d0 I/O already loaded with SRC data from [%o1-8]
- ! %o2 I/O count (number of bytes that need to be written)
- ! %o3 I Not written. If zero, then SRC is double aligned.
- ! %o4 I Not written. Holds fprs.
- ! %o5 O The number of doubles that remain to be written.
-
- ! Load the rest of the current block
- ! Recall that %o1 is further into SRC than %o0 is into DST
-
- ! prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
- ! executed in delay slot for branch to .xlarge
- prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
- prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
- ldda [%o1]%asi, %d2
- prefetch [%o1 + (6 * BLOCK_SIZE)], #one_read
- ldda [%o1 + 0x8]%asi, %d4
- faligndata %d0, %d2, %d16
- ldda [%o1 + 0x10]%asi, %d6
- faligndata %d2, %d4, %d18
- ldda [%o1 + 0x18]%asi, %d8
- faligndata %d4, %d6, %d20
- ldda [%o1 + 0x20]%asi, %d10
- or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8
- faligndata %d6, %d8, %d22
- ldda [%o1 + 0x28]%asi, %d12
- movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later)
- faligndata %d8, %d10, %d24
- ldda [%o1 + 0x30]%asi, %d14
- faligndata %d10, %d12, %d26
- ldda [%o1 + 0x38]%asi, %d0
- sub %o2, BLOCK_SIZE, %o2 ! update count
- prefetch [%o1 + (7 * BLOCK_SIZE)], #one_read
- add %o1, BLOCK_SIZE, %o1 ! update SRC
-
- ! This point is 32-byte aligned, as 24 instructions appear after
- ! the previous alignment directive.
-
-
- ! Main loop. Write previous block. Load rest of current block.
- ! Some bytes will be loaded that won't yet be written.
-1:
- ldda [%o1]%asi, %d2
- faligndata %d12, %d14, %d28
- ldda [%o1 + 0x8]%asi, %d4
- faligndata %d14, %d0, %d30
- stda %d16, [%o0]ASI_BLK_P
- sub %o2, BLOCK_SIZE, %o2 ! update count
- ldda [%o1 + 0x10]%asi, %d6
- faligndata %d0, %d2, %d16
- ldda [%o1 + 0x18]%asi, %d8
- faligndata %d2, %d4, %d18
- ldda [%o1 + 0x20]%asi, %d10
- faligndata %d4, %d6, %d20
- ldda [%o1 + 0x28]%asi, %d12
- faligndata %d6, %d8, %d22
- ldda [%o1 + 0x30]%asi, %d14
- faligndata %d8, %d10, %d24
- ldda [%o1 + 0x38]%asi, %d0
- faligndata %d10, %d12, %d26
- ! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
- prefetch [%o1 + (8 * BLOCK_SIZE) + 8], #one_read
- add %o0, BLOCK_SIZE, %o0 ! update DST
- cmp %o2, BLOCK_SIZE + 8
- ! second prefetch important to correct for occasional dropped
- ! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K
- ! strong prefetch prevents drops on Panther, but Jaguar and earlier
- ! US-III models treat strong prefetches as weak prefetches;
- ! to avoid regressions on customer hardware, we retain the prefetch
- prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
- bgu,pt %ncc, 1b
- add %o1, BLOCK_SIZE, %o1 ! update SRC
-
- faligndata %d12, %d14, %d28
- faligndata %d14, %d0, %d30
- stda %d16, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache
- cmp %o2, BLOCK_SIZE
- bne %ncc, 2f ! exactly 1 block remaining?
- add %o0, BLOCK_SIZE, %o0 ! update DST
- brz,a %o3, 3f ! is SRC double aligned?
- ldd [%o1], %d2
-
-2:
- add %o5, %o2, %o5 ! %o5 was already set to 0 or -8
- add %o5, %o3, %o5
-
-
- ba .beginmedloop
- andn %o5, 7, %o5 ! 8 byte aligned count
-
-
- ! This is when there is exactly 1 block remaining and SRC is aligned
-3:
- ! %d0 was loaded in the last iteration of the loop above, and
- ! %d2 was loaded in the branch delay slot that got us here.
- ldd [%o1 + 0x08], %d4
- ldd [%o1 + 0x10], %d6
- ldd [%o1 + 0x18], %d8
- ldd [%o1 + 0x20], %d10
- ldd [%o1 + 0x28], %d12
- ldd [%o1 + 0x30], %d14
- stda %d0, [%o0]ASI_BLK_P
-
- ba .exit_memcpy
- nop
-
-copying_ge_512:
- mov %o0, %o5 ! save dst address for return value.
- ! both src and dst are aligned to 8 byte boundary.
- save %sp, -SA(STACK_OFFSET + SAVESIZE), %sp
- mov %i0, %o0
- mov %i1, %o1
- mov %i2, %o2
- mov %i3, %o3
- mov %i5, %o5
-#ifndef __sparcv9
- stx %g4, [%sp + STACK_OFFSET + g4_offset]
- stx %g5, [%sp + STACK_OFFSET + g5_offset]
-#endif
- rd %fprs, %g5 ! check for unused fp
- andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0
- bz,a %ncc, 1f
- wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
-1:
- !prefetch src buf
- sub %o1,1,%o3
- andn %o3,0x7f,%l1
- add %l1,128,%l1
- prefetch [%l1],2 !prefetch next 128b
- prefetch [%l1+64],2
- prefetch [%l1+(2*64)],2 !cont from above
- prefetch [%l1+(3*64)],2
- !prefetch dst buf
- sub %o5,1,%o3
- andn %o3,0x7f,%l1
- add %l1,128,%l1
- prefetch [%l1],2 !prefetch next 128b
- prefetch [%l1+64],2
- prefetch [%l1+(2*64)],2 !cont from above
- prefetch [%l1+(3*64)],2
-
- andcc %o5,0x7f,%o3 !o3=0 means it is already 128-byte aligned
- brz,pn %o3,aligned_on_128
- sub %o3,128,%o3
-
- add %o2,%o3,%o2
-align_to_128:
- ldxa [%o1]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o1,8,%o1 ! increment src pointer
- stxa %o4,[%o5]ASI_CACHE_SPARING_PRIMARY
- addcc %o3,8,%o3
- bl,pt %ncc,align_to_128
- add %o5,8,%o5 ! increment dst pointer
-
-aligned_on_128:
- andcc %o5,0x1ff,%o3 !%o3=0 when it is 512-byte aligned.
- brnz,pn %o3, 4f
- mov %o2,%l4 !l4=count from 512 align
- set 4096, %l2
- subcc %o2, %l2, %g0
- bge,pn %ncc, stingray_optimized_copy
- nop
-4:
-
- sub %o5,8,%l6 !should be in the current 512-byte chunk
- andn %l6,0x1ff,%o3 !%o3=aligned 512-byte addr
- add %o3,0x200,%o3 !%o3=next aligned 512-byte addr at which to start
- ! stingray_optimized_copy
- sub %o3,%o5,%o3 !o3=how many bytes in the current remaining chunk
- sub %o2,%o3,%l4 !l4=count from the 512-byte alignment point
- /*
- * if l4 is < 4096 do interleave_128_copy only.
- */
- set 4096, %l2
- subcc %l4, %l2, %g0
- bge,pn %ncc,6f
- nop
- mov %g0, %l4
- add %o5, %o2, %l1
- ba interleave_128_copy
- nop
-6:
- mov %o3, %o2
- subcc %o3,256,%g0 ! if it is > 256 bytes, we can use
- ! interleave_128_copy
- bl,pn %ncc,copy_word ! otherwise use copy_word to finish the 512 byte
- ! alignment.
- !%o1=64 bytes data
- !%o5=next 8 byte addr to write
- !%o2=new count i.e how many bytes to write
- add %o5,%o2,%l1 !calculate the last byte to write, %l1
- ba interleave_128_copy
- nop
-
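
The arithmetic at 4: above measures the distance to the next 512-byte
destination boundary; a C sketch of it, assuming dst and n mirror %o5
and %o2:

    uintptr_t next512 = (((uintptr_t)dst - 8) & ~(uintptr_t)0x1ff) + 0x200;
    size_t to_boundary = next512 - (uintptr_t)dst;  /* this chunk    (%o3) */
    size_t after = n - to_boundary;                 /* 512-aligned   (%l4) */
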
- .align 64
-interleave_128_copy:
- ! %l1 has the addr of the dest. buffer at or beyond which no write
- ! is to be done.
- ! %l4 has the number of bytes to zero using stingray_optimized_bzero
- !prefetch src
- !prefetch src
-
- add %o1, 256, %o3
- prefetch [%o3], 2 !1st 64 byte line of next 256 byte block
- add %o1, 384, %o3
- prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block
- add %o1, 320, %o3
- prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block
- add %o1, 448, %o3
- prefetch [%o3], 2 !4th 64 byte line of next 256 byte block
-
- !prefetch dst
-
- add %o5, 256, %o3
- prefetch [%o3], 2 !1st 64 byte line of next 256 byte block
- add %o5, 384, %o3
- prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block
- add %o5, 320, %o3
- prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block
- add %o5, 448, %o3
- prefetch [%o3], 2 !4th 64 byte line of next 256 byte block
-
- ldxa [%o1]ASI_CACHE_SPARING_PRIMARY, %o4
- stxa %o4,[%o5]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line
- add %o1, 128, %o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, 128, %o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line
- add %o1, (1 * 8), %o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (1 * 8), %o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (1 * 8 + 128), %o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (1 * 8 + 128), %o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (2 * 8),%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (2 * 8),%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (2 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (2 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (3 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (3 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (3 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (3 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (4 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (4 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (4 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (4 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (5 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (5 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (5 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (5 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (6 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (6 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (6 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (6 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (7 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (7 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (7 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (7 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (8 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (8 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (8 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (8 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (9 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (9 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (9 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (9 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (10 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (10 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (10 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (10 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (11 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (11 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (11 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (11 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (12 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (12 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (12 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (12 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (13 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (13 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (13 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (13 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (14 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (14 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (14 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (14 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (15 * 8) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (15 * 8) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, (15 * 8 + 128) ,%o3
- ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o5, (15 * 8 + 128) ,%o3
- stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o1, 256, %o1
- ! check if the next 256 byte copy will not exceed the number of
- ! bytes remaining to be copied.
- ! %l2 points to the dest buffer after copying 256 bytes more.
- ! %l1 points to dest. buffer at or beyond which no writes should be done.
- add %o5,512,%l2
-
- subcc %l1,%l2,%g0
- bge,pt %ncc,interleave_128_copy
- add %o5,256,%o5
-
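
Each pass of interleave_128_copy above moves 256 bytes as two pairs of
cache lines 128 bytes apart, 8 bytes at a time, alternating between the
pairs so the stores spread across banks. Stripped of the cache-sparing
ASI, one pass is roughly:

    /* C sketch of one 256-byte pass; s and d mirror %o1 and %o5 */
    for (int i = 0; i < 16; i++) {
        ((uint64_t *)d)[i]      = ((const uint64_t *)s)[i];      /* lines 0-1 */
        ((uint64_t *)d)[i + 16] = ((const uint64_t *)s)[i + 16]; /* lines 2-3 */
    }
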
-copy_word:
- and %o2,255,%o3
- and %o3,7,%o2
-
- ! Set the remaining doubles
- subcc %o3, 8, %o3 ! Can we store any doubles?
- bl,pn %ncc, 6f
- and %o2, 7, %o2 ! calc bytes left after doubles
-
- !prefetch src
-
- mov %o1, %o4
- prefetch [%o4], 2 !1st 64 byte line of next 256 byte block
- add %o1, 128, %o4
- prefetch [%o4], 2 !3rd 64 byte line of next 256 byte block
- add %o1, 64, %o4
- prefetch [%o4], 2 !2nd 64 byte line of next 256 byte block
- add %o1, 192, %o4
- prefetch [%o4], 2 !4th 64 byte line of next 256 byte block
-
- !prefetch dst
-
- mov %o5, %o4
- prefetch [%o4], 2 !1st 64 byte line of next 256 byte block
- add %o5, 128, %o4
- prefetch [%o4], 2 !3rd 64 byte line of next 256 byte block
- add %o5, 64, %o4
- prefetch [%o4], 2 !2nd 64 byte line of next 256 byte block
- add %o5, 192, %o4
- prefetch [%o4], 2 !4th 64 byte line of next 256 byte block
-
-5:
- ldxa [%o1]ASI_CACHE_SPARING_PRIMARY, %o4
- add %o1, 8, %o1
- stxa %o4, [%o5]ASI_CACHE_SPARING_PRIMARY
- subcc %o3, 8, %o3
- bge,pt %ncc, 5b
- add %o5, 8, %o5
-6:
- ! Set the remaining bytes
- brz %o2, can_we_do_stingray_optimized_copy
- nop
-
- ! Terminate the copy with a partial store.
- ! The data should be at d0
- ldxa [%o1]ASI_CACHE_SPARING_PRIMARY, %o4
- stx %o4, [%sp + STACK_OFFSET + scratch_offset]
- ldd [%sp + STACK_OFFSET + scratch_offset], %d0
-
- dec %o2 ! needed to get the mask right
- edge8n %g0, %o2, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
-can_we_do_stingray_optimized_copy:
- mov %l4, %o2
- brnz,pn %o2, stingray_optimized_copy
- nop
-
-exit:
- brnz %g5, 1f
- nop
- wr %g5, %g0, %fprs
-1:
-#ifndef __sparcv9
- ldx [%sp + STACK_OFFSET + g4_offset], %g4
- ldx [%sp + STACK_OFFSET + g5_offset], %g5
-#endif
- ret ! %o0 was preserved
- restore
-
-
-stingray_optimized_copy:
-!%o5 = next memory addr which is 512-byte aligned
-!%l4 = remaining bytes from the 512-byte alignment point.
-
- add %o5, %l4, %o2
-
- prefetch [%o1+0],2
- prefetch [%o1+(64*1)],2
- prefetch [%o1+(64*2)],2
- prefetch [%o1+(64*3)],2
- prefetch [%o1+(64*4)],2
- prefetch [%o1+(64*5)],2
- prefetch [%o1+(64*6)],2
- prefetch [%o1+(64*7)],2
- prefetch [%o1+(64*8)],2
- prefetch [%o1+(64*9)],2
- prefetch [%o1+(64*10)],2
- prefetch [%o1+(64*11)],2
- prefetch [%o1+(64*12)],2
- prefetch [%o1+(64*13)],2
- prefetch [%o1+(64*14)],2
- prefetch [%o1+(64*15)],2
-
- prefetch [%o5+0],2
- prefetch [%o5+(64*1)],2
- prefetch [%o5+(64*2)],2
- prefetch [%o5+(64*3)],2
- prefetch [%o5+(64*4)],2
- prefetch [%o5+(64*5)],2
- prefetch [%o5+(64*6)],2
- prefetch [%o5+(64*7)],2
- prefetch [%o5+(64*8)],2
- prefetch [%o5+(64*9)],2
- prefetch [%o5+(64*10)],2
- prefetch [%o5+(64*11)],2
- prefetch [%o5+(64*12)],2
- prefetch [%o5+(64*13)],2
- prefetch [%o5+(64*14)],2
- prefetch [%o5+(64*15)],2
-
- ba myloop2
- srl %l4, 12, %l4
-
- ! Local register usage:
- !
- ! %l1 address a short distance ahead of current %o1 for prefetching
- ! into L1 cache.
- ! %l2 address far ahead of current %o1 for prefetching into L2 cache.
- ! %l3 save %o5 at start of inner loop.
- ! %l4 Number of 4k blocks to copy
- ! %g1 save %o1 at start of inner loop.
- ! %l5 iteration counter to make buddy loop execute 2 times.
- ! %l6 iteration counter to make inner loop execute 4 times
- ! (8 lines per iteration, 32 lines total).
- ! %l7 address far ahead of current %o5 for prefetching destination
- ! into L2 cache.
-
-.align 64
-myloop2:
- set 2,%l5 ! %l5 is the loop count for the buddy loop, for 2 buddy lines.
- add %o5, 0, %l3
- add %o1, 0, %g1
-buddyloop:
- set PF_FAR, %g4 ! Prefetch far ahead. CHANGE FAR PREFETCH HERE.
- add %o1, %g4, %l2 ! For prefetching far ahead, set %l2 far ahead
- ! of %o1
- add %o1, PF_NEAR, %l1 ! For prefetching into L1 D$, set %l1 a
- ! little ahead of %o1
- add %o5, %g4, %l7 ! For prefetching far ahead, set %l7 far ahead
- ! of %o5
-
- add %l2, %g4, %g4 ! %g4 is now double far ahead of the source
- ! address in %o1.
- prefetch [%g4+%g0],2 ! Prefetch ahead by several pages to get TLB
- ! entry in advance.
- set 2*PF_FAR, %g4 ! Prefetch double far ahead. SET DOUBLE FAR
- ! PREFETCH HERE.
- add %o5, %g4, %g4 ! %g4 is now double far ahead of the dest
- ! address in %o5.
- prefetch [%g4+%g0],2 ! Prefetch ahead by 2 pages to get TLB entry
- ! in advance.
-
- set 4,%l6 ! %l6 = loop count for the inner loop,
- ! for 4 x 8 = 32 lines.
- set 0, %g4
-
- ! Each iteration of the inner loop below copies 8 sequential lines.
- ! This loop is iterated 4 times, to move a total of 32 lines,
- ! all of which have the same value of PA[9], so we increment the base
- ! address by 1024 bytes in each iteration, which varies PA[10]. */
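
In C terms, the three nested loops visit every 64-byte line of a 4 KB
block exactly once, in an order that keeps PA[9] fixed within each inner
pass (copy_line64 is a hypothetical stand-in for the eight-ldd/stda
group below):

    for (int buddy = 0; buddy < 2; buddy++)        /* even, then odd 512B chunks */
        for (int i = 0; i < 4; i++)                /* 4 chunks spaced 1 KB apart */
            for (int line = 0; line < 8; line++) { /* 8 x 64B lines per chunk    */
                size_t off = buddy * 512 + i * 1024 + line * 64;
                copy_line64(d + off, s + off);
            }
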
-innerloop:
- /* ---- copy line 1 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 2 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 3 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 4 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 5 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 6 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 7 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
- add %g4, 64, %g4
- add %o5, 64, %o5
- add %o1, 64, %o1 /* increment %o1 for the next source line. */
-
- /* ---- copy line 8 of 8. ---- */
- prefetch [%l2+%g4],2
- prefetch [%l7+%g4],2
- prefetch [%l1+%g4],1
-
- ldd [%o1],%d0
- ldd [%o1+8],%d2
- ldd [%o1+16],%d4
- ldd [%o1+24],%d6
- ldd [%o1+32],%d8
- ldd [%o1+40],%d10
- ldd [%o1+48],%d12
- ldd [%o1+56],%d14
- stda %d0,[%o5]ASI_BLK_P
-
- subcc %l6,1,%l6 /* Decrement the inner loop counter. */
-
- ! Now increment by 64 + 512 so we don't toggle PA[9]
- add %g4, 576, %g4
- add %o5, 576, %o5
-
- bg,pt %icc,innerloop
- add %o1, 576, %o1 ! increment %o1 for the next source line.
- ! END OF INNER LOOP
-
-
- subcc %l5,1,%l5
- add %l3, 512, %o5 ! increment %o5 to first buddy line of dest.
- bg,pt %icc,buddyloop
- add %g1, 512 ,%o1 ! Set %o1 to the first of the odd buddy lines.
-
- subcc %l4, 1, %l4
- add %o5, 3584, %o5 ! Advance both base addresses to 4k above where
- ! they started.
- add %o1, 3584, %o1 ! They were already incremented by 512,
- ! so just add 3584.
-
- bg,pt %icc,myloop2
- nop
-
- /****larryalg_end_here*************/
-
- sub %o2,%o5,%o2 !how many bytes left
- brz,pn %o2,complete_write
- mov %g0,%l4
- add %o5,%o2,%l1 !calculate the last byte to write, %l1
- subcc %o2,256,%g0
- bge,pt %ncc,interleave_128_copy
- mov %g0,%l4
-
- ba copy_word
- nop
-
-
-complete_write:
- ba exit
- nop
-
-
-
- SET_SIZE(memcpy)
- SET_SIZE(__align_cpy_1)
diff --git a/usr/src/lib/libc/sparc_hwcap1/common/gen/memset.s b/usr/src/lib/libc/sparc_hwcap1/common/gen/memset.s
deleted file mode 100644
index f9e0f62ac9..0000000000
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/memset.s
+++ /dev/null
@@ -1,767 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-
- .file "memset.s"
-/*
- * char *memset(sp, c, n)
- *
- * Set an array of n chars starting at sp to the character c.
- * Return sp.
- *
- * Fast assembler language version of the following C-program for memset
- * which represents the `standard' for the C-library.
- *
- * void *
- * memset(void *sp1, int c, size_t n)
- * {
- * if (n != 0) {
- * char *sp = sp1;
- * do {
- * *sp++ = (char)c;
- * } while (--n != 0);
- * }
- * return (sp1);
- * }
- */
-
-#include <sys/asm_linkage.h>
-#include <sys/sun4asi.h>
-
- ANSI_PRAGMA_WEAK(memset,function)
-
-#define SAVESIZE (8 * 1)
-#ifdef __sparcv9
-#define STACK_OFFSET (STACK_BIAS + 0)
-#else
-#define STACK_OFFSET (STACK_BIAS + 0 + 0)
-#endif
-#define scratch_offset 0
-
-#define ASI_CACHE_SPARING_PRIMARY 0xf4
-#define ALIGN8(X) (((X) + 7) & ~7)
-#define ICACHE_LINE_SIZE 64
-#define FPRS_FEF 0x4
-#define PF_FAR 2048
-
- .section ".text"
- .align ICACHE_LINE_SIZE
-
- /*
- * Optimizations done:
- *
- * No stores in delay slot of branch instructions.
- * conditional stores where possible
- * prefetch before doing stxa
- * Bank interleaved writing.
- */
-
- ENTRY(memset)
- add %sp, -SA(STACK_OFFSET + SAVESIZE), %sp
- mov %o0, %o5 ! copy sp1 before using it
- /*
- * If there are 0 bytes to transfer, return
- */
- brnz %o2, continue
- nop
- retl
- add %sp, SA(STACK_OFFSET + SAVESIZE), %sp
-continue:
- /*
- * If the count is a multiple of 8 and the buffer is 8-byte aligned,
- * we don't have to look at %fprs
- */
- or %o5, %o2, %o3
- and %o3, 7, %o3
- brnz %o3, check_fprs
- mov 4, %g1
- prefetch [%o5],2
- ba skip_rd_fprs
- nop
-
-check_fprs:
- rd %fprs, %g1 ! g1 = fprs
-skip_rd_fprs:
- prefetch [%o5],2
- andcc %g1, 0x4, %g1 ! fprs.du = fprs.dl = 0
- bnz %ncc, 1f ! Is fprs.fef == 1
- nop
- wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
-1:
- and %o1, 0xff, %o1 ! o1 is (char)c
- sll %o1, 8, %o3
- or %o1, %o3, %o1 ! now o1 has 2 bytes of c
- sll %o1, 16, %o3
- or %o1, %o3, %o1 ! now o1 has 4 bytes of c
- sllx %o1, 32, %o3
- or %o1, %o3, %o1 ! now o1 has 8 bytes of c
- stx %o1, [%sp + STACK_OFFSET + scratch_offset]
- ldd [%sp + STACK_OFFSET + scratch_offset], %d0
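
The shift/or ladder above replicates the fill byte across a 64-bit
pattern; in C:

    uint64_t pat = (uint8_t)c;    /* 1 byte of c  */
    pat |= pat << 8;              /* 2 bytes of c */
    pat |= pat << 16;             /* 4 bytes of c */
    pat |= pat << 32;             /* 8 bytes of c */
    /* the assembly then bounces pat through the stack into %d0
       so the FP partial-store path can use it */
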
- cmp %o2, 8
- bge,pt %ncc, xfer_8_or_more
- mov %o0, %o5
- /*
- * Do a partial store of %o2 bytes
- */
- andcc %o5, 7, %o3 ! is sp1 aligned on an 8 byte boundary
- brz,pt %o3, aligned_on_8
- sub %o5, %o3, %o5 ! align the destination buffer.
- mov %o3, %o1
- mov 8, %o4
- sub %o4, %o3, %o3
- cmp %o3, %o2
- bg,a,pn %ncc, 1f
- mov %o2, %o3
-1:
- ! %o3 has the bytes to be written in partial store.
- sub %o2, %o3, %o2
- dec %o3
- prefetch [%o5],2
- edge8n %g0, %o3, %o4
- srl %o4, %o1, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
- brz %o2, simple_ret
- add %o5, 8, %o5
-aligned_on_8:
- prefetch [%o5],2
- dec %o2 ! needed to get the mask right
- edge8n %g0, %o2, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
- brnz %g1, 1f ! was fprs.fef == 1
- nop
- wr %g1, %g0, %fprs ! fprs = g1 restore fprs
-1:
- retl
- add %sp, SA(STACK_OFFSET + SAVESIZE), %sp
-
-xfer_8_or_more:
- andcc %o5, 7, %o3 ! is sp1 aligned on an 8 byte boundary
- brz,pt %o3, blkchk
- sub %o5, %o3, %o5 ! align the destination buffer.
- sub %o3, 8, %o3 ! -(bytes till double aligned)
- add %o2, %o3, %o2 ! update o2 with new count
- xor %o3, 0xff, %o3
- and %o3, 7, %o3
- prefetch [%o5],2
- edge8ln %g0, %o3, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
- add %o5, 8, %o5
-
-
- ! Now sp1 is double aligned (sp1 is found in %o5)
-blkchk:
- cmp %o2, 767 ! if large count use Block ld/st
- bg,pt %ncc,blkwr
- nop
-
-
- and %o2, 24, %o3 ! o3 is {0, 8, 16, 24}
-
- brz %o3, skip_dw_loop
- nop
-
-1: subcc %o3, 8, %o3 ! double-word loop
- stx %o1, [%o5]
- bgu,pt %ncc, 1b
- add %o5, 8, %o5
-skip_dw_loop:
- andncc %o2, 31, %o4 ! o4 has 32 byte aligned count
- brz,pn %o4, 3f
- nop
- ba loop_32byte
- nop
-
- .align ICACHE_LINE_SIZE
-
-loop_32byte:
- subcc %o4, 32, %o4 ! main loop, 32 bytes per iteration
- stx %o1, [%o5]
- stx %o1, [%o5 + 8]
- stx %o1, [%o5 + 16]
- stx %o1, [%o5 + 24]
- bne,pt %ncc, loop_32byte
- add %o5, 32, %o5
-3:
- and %o2, 7, %o2 ! o2 has the remaining bytes (<8)
- brz %o2, skip_partial_copy
- nop
-
- ! Terminate the copy with a partial store.
- ! The data should be at d0
- prefetch [%o5],2
- dec %o2 ! needed to get the mask right
- edge8n %g0, %o2, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
-
-skip_partial_copy:
-simple_ret:
- brz,a %g1, 1f ! was fprs.fef == 0
- wr %g1, %g0, %fprs ! fprs = g1 restore fprs
-1:
- retl
- add %sp, SA(STACK_OFFSET + SAVESIZE), %sp
-
-blkwr:
- sub %o5,1,%o3
- andn %o3,0x7f,%o4
- add %o4,128,%o4
- prefetch [%o4],2 !prefetch next 128b
- prefetch [%o4+64],2
- prefetch [%o4+(2*64)],2 !cont from above
- prefetch [%o4+(3*64)],2
-
- andcc %o5,0x7f,%o3 !o3=0 means it is already 128-byte aligned
- brz,pn %o3,alreadyalign128
- sub %o3,128,%o3
-
- add %o2,%o3,%o2
-align128:
- stxa %o1,[%o5]ASI_CACHE_SPARING_PRIMARY
- addcc %o3,8,%o3
- bl,pt %ncc,align128
- add %o5,8,%o5
-
-
-
-alreadyalign128:
- andcc %o5,0x1ff,%o3 !%o3=0 when it is 512-byte aligned.
- brnz,pn %o3, 4f
- mov %o2,%g5 !g5=count from 512 align
- set 4096, %o4
- subcc %o2, %o4, %g0
- bge,pn %ncc, larry_alg
- nop
-4:
-
- sub %o5,8,%o4 !should be in the current 512-byte chunk
- andn %o4,0x1ff,%o3 !%o3=aligned 512-byte addr
- add %o3,0x200,%o3 !%o3=next aligned 512-byte addr that starts the larry process
- sub %o3,%o5,%o3 !o3=how many bytes in the current remaining chunk
- sub %o2,%o3,%g5 !g5=count from the 512-byte alignment point
- /*
- * if g5 is < 4096 do start_128 only.
- */
- set 4096, %o4
- subcc %g5, %o4, %g0
- bge,pn %ncc,6f
- nop
- mov %g0, %g5
- add %o5, %o2, %o4
- ba start_128
- nop
-6:
- mov %o3, %o2
- subcc %o3,256,%g0 !if it is > 256 bytes, we can use the st-interleave alg to write
- bl,pn %ncc,storeword !otherwise use storeword to finish the 512-byte alignment.
- !%o1=64 bytes data
- !%o5=next 8 byte addr to write
- !%o2=new count i.e how many bytes to write
- add %o5,%o2,%o4 !calculate the last byte to write, %o4
- ba start_128
- nop
-
- .align 64
-start_128:
- add %o5, 256, %o3
- prefetch [%o3], 2 !1st 64 byte line of next 256 byte block
- add %o5, 384, %o3
- prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block
- add %o5, 320, %o3
- prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block
- add %o5, 448, %o3
- prefetch [%o3], 2 !4th 64 byte line of next 256 byte block
- mov %o5, %o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line
- add %o5,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line
- add %o5,8,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(2 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128 ,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(3 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(4 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(5 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(6 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(7 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(8 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(9 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(10 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(11 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(12 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(13 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(14 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(15 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add	%o5,512,%o3	!%o3 = end of the next 256-byte block, to check if another 256-byte block lies ahead
-	subcc	%o4,%o3,%g0	!%o4 = final byte location; %o3 = final byte of the next 256-byte block
-	bge,pt	%ncc,start_128	!branch taken means the next 256-byte block is still within the limit
- add %o5,256,%o5
-
-! finish the remaining sub-256-byte tail
-storeword:
- and %o2,255,%o3
- and %o3,7,%o2
-
- ! Set the remaining doubles
- subcc %o3, 8, %o3 ! Can we store any doubles?
- bl,pn %ncc, 6f
- and %o2, 7, %o2 ! calc bytes left after doubles
-
-5:
- stxa %o1, [%o5]ASI_CACHE_SPARING_PRIMARY
- subcc %o3, 8, %o3
- bge,pt %ncc, 5b
- add %o5, 8, %o5
-6:
- ! Set the remaining bytes
- brz %o2, check_larry_alg ! safe to check all 64-bits
-
- ! Terminate the copy with a partial store.
- ! The data should be at d0
- dec %o2 ! needed to get the mask right
- edge8n %g0, %o2, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
-check_larry_alg:
- mov %g5, %o2
- brnz,pn %o2, larry_alg
- nop
-
-.exit:
- brz,a %g1, 1f ! was fprs.fef == 0
- wr %g1, %g0, %fprs ! fprs = g1 restore fprs
-1:
- retl ! %o0 was preserved
- add %sp, SA(STACK_OFFSET + SAVESIZE), %sp
-
-larry_alg:
- add %sp, SA(STACK_OFFSET + SAVESIZE), %sp
- save %sp, -SA(MINFRAME), %sp
- mov %i0, %o0
- mov %i1, %o1
- mov %i2, %o2
- mov %i3, %o3
- mov %i5, %o5
-!%o5 = next memory addr, which is 512-byte aligned
-!%g5 = remaining bytes from the 512-byte aligned point
-init:
- set 4096,%g6
-
- prefetch [%o5+0],2
- prefetch [%o5+(64*1)],2
- prefetch [%o5+(64*2)],2
- prefetch [%o5+(64*3)],2
- prefetch [%o5+(64*4)],2
- prefetch [%o5+(64*5)],2
- prefetch [%o5+(64*6)],2
- prefetch [%o5+(64*7)],2
- prefetch [%o5+(64*8)],2
- prefetch [%o5+(64*9)],2
- prefetch [%o5+(64*10)],2
- prefetch [%o5+(64*11)],2
- prefetch [%o5+(64*12)],2
- prefetch [%o5+(64*13)],2
- prefetch [%o5+(64*14)],2
- prefetch [%o5+(64*15)],2
- ba myloop2
- add %o5,%g5,%g5
-	/* Local register usage:
-	   %l3  saves %o5 at the start of each myloop2 iteration.
-	   %l5  iteration counter to make the buddy loop execute 2 times.
-	   %l6  iteration counter to make the inner loop execute 4 times (32 lines).
-	   %l7  address far ahead of the current %o5, for prefetching the destination into the L2 cache.
-	 */
-
- .align 64
-myloop2:
- /* Section 1 */
- set 2,%l5 /* %l5 is the loop count for the buddy loop, for 2 buddy lines. */
- add %o5, 0, %l3
-buddyloop:
- set PF_FAR, %l4 /* Prefetch far ahead. CHANGE FAR PREFETCH HERE. <<==== */
- add %o5, %l4, %l7 /* For prefetching far ahead, set %l7 far ahead of %o5 */
-
- set 2*PF_FAR, %l4 /* Prefetch double far ahead. SET DOUBLE FAR PREFETCH HERE. <<==== */
- add %o5, %l4, %l4 /* %l4 is now double far ahead of the dest address in %o5. */
- prefetch [%l4+%g0],2 /* Prefetch ahead by 2 pages to get TLB entry in advance. */
-
- set 4,%l6 /* %l6 = loop count for the inner loop, for 4 x 8 = 32 lines. */
- set 0, %l4
-
-
-/* Each iteration of the inner loop below writes 8 sequential lines. This loop is iterated 4 times,
- to move a total of 32 lines, all of which have the same value of PA[9], so we increment the base
- address by 1024 bytes in each iteration, which varies PA[10]. */
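In C, the traversal order these loops implement can be sketched as follows — a minimal illustration assuming a 512-byte-aligned base and a count handled 4 KB at a time (the tail is finished elsewhere); fill_512 and larry_order are hypothetical names, and fill_512 stands in for the interleaved stxa sequences rather than reproducing them:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical stand-in for one 256+256 byte group of stxa stores:
     * fills a 512-byte chunk with the 8-byte pattern. */
    static void fill_512(uint8_t *p, uint64_t pattern)
    {
        for (int i = 0; i < 512; i += 8)
            memcpy(p + i, &pattern, 8);
    }

    /* Within each 4 KB region, set every 512-byte chunk with one value
     * of PA[9] first (innerloop), then the interleaved buddy chunks
     * with the opposite PA[9] (buddyloop). */
    static void larry_order(uint8_t *base, size_t total, uint64_t pattern)
    {
        for (size_t region = 0; region + 4096 <= total; region += 4096) /* myloop2 */
            for (int half = 0; half < 2; half++)                        /* buddyloop */
                for (size_t chunk = 0; chunk < 4096; chunk += 1024)     /* innerloop */
                    fill_512(base + region + half * 512 + chunk, pattern);
    }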
-innerloop:
- add %o5, PF_FAR, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
- add %o3, 64, %o3
- prefetch [%o3],2
-
- mov %o5, %o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line
- add %o5,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line
- add %o5,8,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(2 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128 ,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(3 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(4 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(5 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(6 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(7 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(8 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(9 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(10 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(11 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(12 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(13 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(14 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(15 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-
- add %o5,256,%o5
-
- mov %o5, %o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line
- add %o5,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line
- add %o5,8,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(2 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128 ,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(3 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(4 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(5 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(6 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(7 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(8 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(9 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(10 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(11 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(12 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(13 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(14 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(15 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-
- subcc %l6,1,%l6 /* Decrement the inner loop counter. */
-
- /* -------- Now increment by 256 + 512 so we don't toggle PA[9] -------- */
- add %o5, 768, %o5
-
- bg,pt %ncc,innerloop
- nop
-/* ------------------------ END OF INNER LOOP -------------------------- */
-
- subcc %l5,1,%l5
- add %l3, 512, %o5 /* increment %o5 to first buddy line of dest. */
- bg,pt %ncc,buddyloop
- nop
-	add	%o5, 3584, %o5	/* Advance the base address to 4k above where it started. */
-				!%o5 = next 4096-byte block.
- add %o5,%g6,%i5
- subcc %g5,%i5,%g0
- bge,pt %ncc,myloop2
- nop
-
-
- /****larryalg_end_here*************/
-
-	sub	%g5,%o5,%o2		!how many bytes are left
- brz,pn %o2,complete_write
- mov %g0,%g5
-	add	%o5,%o2,%o4		!calculate the last byte to write, into %o4
- subcc %o2,256,%g0
- bge,pt %ncc,memset_128
- mov %g0,%g5
-
- ba memset_storeword
- nop
-
-
-complete_write:
- brz,a %g1, 1f ! was fprs.fef == 0
- wr %g1, %g0, %fprs ! fprs = g1 restore fprs
-1:
- ret ! %o0 was preserved
- restore
-
- .align 64
-memset_128:
- add %o5, 256, %o3
- prefetch [%o3], 2 !1st 64 byte line of next 256 byte block
- add %o5, 384, %o3
- prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block
- add %o5, 320, %o3
- prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block
- add %o5, 448, %o3
- prefetch [%o3], 2 !4th 64 byte line of next 256 byte block
- mov %o5, %o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line
- add %o5,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line
- add %o5,8,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(2 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128 ,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(3 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(4 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(5 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(6 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(7 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(8 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(9 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(10 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(11 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(12 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(13 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(14 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
- add %o5,(15 * 8),%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add	%o5,512,%l4	!%l4 = end of the next 256-byte block, to check if another 256-byte block lies ahead
- add %o3,128,%o3
- stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-!This branch condition is not needed when handling the bytes before the first
-!4096-byte region, because this block is issued only once there, so %l6 holds
-!stale data. The branch really serves the bytes after the 4096-byte region,
-!where there may be multiple 256-byte blocks to work on.
-
-	subcc	%o4,%l4,%g0	!%o4 = final byte location; %l4 = final byte of the next 256-byte block
-	bge,pt	%ncc,memset_128	!branch taken means the next 256-byte block is still within the limit
- add %o5,256,%o5
-
-! finish the remaining sub-256-byte tail
-memset_storeword:
- and %o2,255,%o3
- and %o3,7,%o2
-
- ! Set the remaining doubles
- subcc %o3, 8, %o3 ! Can we store any doubles?
- bl,pn %ncc, 6f
- and %o2, 7, %o2 ! calc bytes left after doubles
-
-5:
- stxa %o1, [%o5]ASI_CACHE_SPARING_PRIMARY
- subcc %o3, 8, %o3
- bge,pt %ncc, 5b
- add %o5, 8, %o5
-6:
- ! Set the remaining bytes
- brz %o2, complete_write ! safe to check all 64-bits
-
- ! Terminate the copy with a partial store.
- ! The data should be at d0
- dec %o2 ! needed to get the mask right
- edge8n %g0, %o2, %o4
- stda %d0, [%o5]%o4, ASI_PST8_P
-
- brz,a %g1, 1f ! was fprs.fef == 0
- wr %g1, %g0, %fprs ! fprs = g1 restore fprs
-1:
- ret ! %o0 was preserved
- restore
-
-
- SET_SIZE(memset)
diff --git a/usr/src/lib/libc/sparc_hwcap1/common/gen/misc.s b/usr/src/lib/libc/sparc_hwcap1/common/gen/misc.s
deleted file mode 100644
index c2b4aa4c29..0000000000
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/misc.s
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/asm_linkage.h>
-
- ENTRY(_rock_pause)
- membar #Halt
- retl
- nop
- SET_SIZE(_rock_pause)
diff --git a/usr/src/lib/libc/sparc_hwcap1/common/gen/strcpy.s b/usr/src/lib/libc/sparc_hwcap1/common/gen/strcpy.s
deleted file mode 100644
index a9861b1a86..0000000000
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/strcpy.s
+++ /dev/null
@@ -1,340 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
- .file "strcpy.s"
-
-/*
- * strcpy(s1, s2)
- *
- * Copy string s2 to s1. s1 must be large enough. Return s1.
- *
- * Fast assembler language version of the following C-program strcpy
- * which represents the `standard' for the C-library.
- *
- * char *
- * strcpy(s1, s2)
- * register char *s1;
- * register const char *s2;
- * {
- * char *os1 = s1;
- *
- * while(*s1++ = *s2++)
- * ;
- * return(os1);
- * }
- *
- */
-
-#include <sys/asm_linkage.h>
-
-	! This implementation of strcpy works by first checking the
-	! source alignment and copying byte, halfword, or word
-	! quantities until the source ptr is aligned on an extended
-	! word boundary. Once this has occurred, the string is copied,
-	! checking for zero bytes, depending upon its dst ptr alignment.
-	! (methods for xword, word, halfword, and byte copies are present)
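In outline, that strategy reads as the following hedged C sketch — big-endian byte order as on SPARC, the source assumed already 8-byte aligned so each xword read stays within the word holding the terminator (safe on real hardware, though not strictly portable C), and the per-alignment store variants below ignored; strcpy_sketch is an illustrative name:

    #include <stdint.h>
    #include <string.h>

    static char *strcpy_sketch(char *dst, const char *src)
    {
        const uint64_t ones  = 0x0101010101010101ULL;
        const uint64_t highs = 0x8080808080808080ULL;
        char *os1 = dst;

        for (;;) {
            uint64_t x;
            memcpy(&x, src, 8);              /* one aligned xword */
            if ((x - ones) & ~x & highs)     /* this word holds the '\0' */
                break;
            memcpy(dst, &x, 8);
            src += 8;
            dst += 8;
        }
        while ((*dst++ = *src++) != '\0')    /* finish byte by byte */
            ;
        return (os1);
    }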
-
-#ifdef __sparcv9
-#define SAVESIZE (8 * 3)
-#define STACK_OFFSET (STACK_BIAS + MINFRAME)
-#else
-#define SAVESIZE (8 * 5)
-#define STACK_OFFSET (STACK_BIAS + MINFRAME + 4)
-#endif
-
-#define LABEL_ADDRESS(label, reg) \
- .pushlocals ;\
-0: rd %pc, reg ;\
- add reg, (label) - 0b, reg ;\
- .poplocals
-
-offset_table:
- .word .storexword - offset_table ! Offset 0 => xword aligned
- .word .storebyte1241 - offset_table ! Offset 1 or 5
- .word .storehalfword - offset_table ! Offset 2 or 6
- .word .storebyte1421 - offset_table ! Offset 3 or 7
- .word .storeword - offset_table ! Offset 4
-
- .align 64
-#ifdef __sparcv9
- .skip 20
-#else
- .skip 12
-#endif
-
- ENTRY(strcpy)
- add %sp, -SA(STACK_OFFSET + SAVESIZE), %sp
-#ifndef __sparcv9
- stx %g4, [%sp + STACK_OFFSET + 24]
- stx %g5, [%sp + STACK_OFFSET + 32]
-#endif
- sethi %hi(0x01010101), %o4 ! 0x01010000
- sub %o1, %o0, %o3 ! src - dst
- or %o4, %lo(0x01010101), %o4 ! 0x01010101
- andcc %o1, 7, %g5 ! dword aligned ?
- sllx %o4, 32, %o5 ! 0x01010101 << 32
- mov %o0, %o2 ! save dst
- or %o4, %o5, %o4 ! 0x0101010101010101
-
- bz,pt %ncc, .srcaligned ! yup
- sllx %o4, 7, %o5 ! 0x8080808080808080
-
- sub %g0, %g5, %g4 ! count = -off
- ldx [%o1 + %g4], %o1 ! val = *(addr + -off)
- mov -1, %g1 ! mask = -1
- sllx %g5, 3, %g4 ! shift = off * 8
- srlx %g1, %g4, %g1 ! -1 >> ((addr & 7) * 8)
- orn %o1, %g1, %o1 ! val |= ~mask
-
- andn %o5, %o1, %g4 ! ~val & 0x80
- sub %o1, %o4, %g1 ! val - 0x01
- andcc %g4, %g1, %g4 ! ~val & 0x80 & (val - 0x01)
-
- sllx %g5, 3, %g4
- add %o2, 8, %o2 ! .zerobyte expects address = address + 8
- bnz,a,pn %xcc, .zerobyte ! Zero byte in the first xword
- sllx %o1, %g4, %o1 ! and data to be left justified
-
- sub %o2, 8, %o2
- mov 8, %g4
- sub %g4, %g5, %g1 ! Bytes to be written
- sub %g1, 1, %g4
-
-1: stub %o1, [%o2 + %g4]
- dec %g4
- brgez,pt %g4, 1b
- srlx %o1, 8, %o1
-
- add %o2, %g1, %o2 ! Move ptr by #bytes written
-
-.srcaligned:
- !! Check if the first dword contains zero after src is aligned
- ldx [%o2 + %o3], %o1 ! x = src[]
- andn %o5, %o1, %g1 ! ~x & 0x8080808080808080
- sub %o1, %o4, %g4 ! x - 0x0101010101010101
- andcc %g4, %g1, %g0 ! ((x - 0x0101010101010101) & ~x & 0x8080808080808080)
- bnz,a,pn %xcc, .zerobyte ! x has zero byte, handle end cases
- add %o2, 8, %o2 ! src += 8, dst += 8
-
- !! Determine the destination offset and branch
- !! to appropriate location
- and %o2, 3, %g4
- and %o2, 4, %g1
- or %g1, %g4, %g1
- movrnz %g4, 0, %g1
- movrnz %g1, 4, %g4
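In C, the movrnz pair above amounts to the following mapping — table_index is a hypothetical name for illustration:

    /* Map a dst misalignment of 0-7 to the offset_table index:
     * 0 -> 0 (xword), 1/5 -> 1, 2/6 -> 2, 3/7 -> 3, 4 -> 4 (word). */
    static int table_index(unsigned off)
    {
        return (off & 3) ? (int)(off & 3) : (int)(off & 4);
    }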
-
- !! %g4 contains the index of the jump address
- !! Load the address from the table.
- LABEL_ADDRESS(offset_table, %g1)
- sllx %g4, 2, %g4
- lduw [%g1 + %g4], %g4
- jmp %g1 + %g4
- add %o2, 8, %o2 ! src += 8, dst += 8
-
-.storexword:
- stx %o1, [%o2 - 8] ! store word to dst (address pre-incremented)
-
-1:
- ldx [%o2 + %o3], %o1 ! src dword
- add %o2, 8, %o2 ! src += 8, dst += 8
- andn %o5, %o1, %g1 ! ~dword & 0x8080808080808080
- sub %o1, %o4, %g4 ! dword - 0x0101010101010101
- andcc %g4, %g1, %g0 ! ((dword - 0x0101010101010101) & ~dword & 0x8080808080808080)
- bz,a,pt %xcc, 1b ! no zero byte if magic expression == 0
- stx %o1, [%o2 - 8] ! store word to dst (address pre-incremented)
-
- ba,a .zerobyte
-
-.storebyte1421:
- !! Offset 3 or 7
- srlx %o1, 56, %g1 ! %g1<7:0> = first byte; word aligned now
- stb %g1, [%o2 - 8] ! store first byte
- srlx %o1, 24, %g1 ! %g1<31:0> = bytes 2, 3, 4, 5
- stw %g1, [%o2 - 7] ! store bytes 2, 3, 4, 5
- srlx %o1, 8, %g1 ! %g1<15:0> = bytes 6, 7
- sth %g1, [%o2 - 3] ! store bytes 6, 7
-
- stx %l0, [%sp + STACK_OFFSET + 0]
- and %o2, 7, %g1
- stx %l1, [%sp + STACK_OFFSET + 8]
- cmp %g1, 3
- stx %l2, [%sp + STACK_OFFSET + 16]
-
- move %ncc, 40, %l0
- move %ncc, 24, %l1
- move %ncc, -11, %l2
-
- movne %ncc, 8, %l0
- movne %ncc, 56, %l1
- movne %ncc, -15, %l2
-
- ba .dstaligned
- mov %o1, %g5
-
-.storebyte1241:
- !! Offset 1 or 5
- srlx %o1, 56, %g1 ! %g1<7:0> = first byte; word aligned now
- stb %g1, [%o2 - 8] ! store first byte
- srlx %o1, 40, %g1 ! %g1<15:0> = bytes 2, 3
- sth %g1, [%o2 - 7] ! store bytes 2, 3
- srlx %o1, 8, %g1 ! %g1<31:0> = bytes 4, 5, 6, 7
- stw %g1, [%o2 - 5] ! store bytes 4, 5, 6, 7
-
- stx %l0, [%sp + STACK_OFFSET + 0]
- and %o2, 7, %g1
- stx %l1, [%sp + STACK_OFFSET + 8]
- cmp %g1, 1
- stx %l2, [%sp + STACK_OFFSET + 16]
-
- move %ncc, 56, %l0
- move %ncc, 8, %l1
- move %ncc, -9, %l2
-
- movne %ncc, 24, %l0
- movne %ncc, 40, %l1
- movne %ncc, -13, %l2
-
- ba .dstaligned
- mov %o1, %g5
-
-.storehalfword:
- srlx %o1, 48, %g1 ! get first and second byte
- sth %g1, [%o2 - 8] ! store first and second byte; word aligned now
- srlx %o1, 16, %g1 ! %g1<31:0> = bytes 3, 4, 5, 6
- stw %g1, [%o2 - 6] ! store bytes 3, 4, 5, 6
-
- stx %l0, [%sp + STACK_OFFSET + 0]
- and %o2, 7, %g1
- stx %l1, [%sp + STACK_OFFSET + 8]
- cmp %g1, 2
- stx %l2, [%sp + STACK_OFFSET + 16]
-
- move %ncc, 48, %l0
- move %ncc, 16, %l1
- move %ncc, -10, %l2
-
- movne %ncc, 16, %l0
- movne %ncc, 48, %l1
- movne %ncc, -14, %l2
-
- ba .dstaligned
- mov %o1, %g5
-
-.storeword:
- srlx %o1, 32, %g1 ! get bytes 1,2,3,4
- stw %g1, [%o2 - 8] ! store bytes 1,2,3,4 (address is pre-incremented)
-
- stx %l0, [%sp + STACK_OFFSET + 0]
- mov 32, %l0 ! Num of bits to be shifted left
- stx %l1, [%sp + STACK_OFFSET + 8]
- mov 32, %l1 ! Num of bits to be shifted right
- stx %l2, [%sp + STACK_OFFSET + 16]
- mov -12, %l2 ! -offset
- mov %o1, %g5
-
- nop ! Do not delete. Used for alignment.
-.dstaligned:
- ldx [%o2 + %o3], %o1 ! x = src[]
- add %o2, 8, %o2 ! src += 8, dst += 8
- andn %o5, %o1, %g1 ! ~x & 0x8080808080808080
- sub %o1, %o4, %g4 ! x - 0x0101010101010101
- andcc %g4, %g1, %g0 ! ((x - 0x0101010101010101) & ~x & 0x8080808080808080)
- bnz,a,pn %xcc, .finishup ! x has zero byte, handle end cases
- stb %g5, [%o2 - 9]
-
- sllx %g5, %l0, %g5
- srlx %o1, %l1, %g4
- or %g5, %g4, %g5
-
- stx %g5, [%o2 + %l2]
- ba .dstaligned
- mov %o1, %g5
-
-.finishup:
- cmp %l0, 56
- be,pn %ncc, .zerobyte_restore
- andcc %o2, 1, %g0
- bnz,a %ncc, 1f
- srlx %g5, 8, %g5
-
-1: srlx %l1, 4, %g4 ! g4 contains 1, 2 or 3
- sub %g4, 1, %g4 ! multiple of 16
- sllx %g4, 4, %g4 ! How many bits to shift
- srlx %g5, %g4, %l0
- add %o2, %l2, %g1
-
-2: sth %l0, [%g1]
- sub %g4, 16, %g4
- add %g1, 2, %g1
- brgez,a,pt %g4, 2b
- srlx %g5, %g4, %l0
-
-.zerobyte_restore:
- ldx [%sp + STACK_OFFSET + 0], %l0
- andn %o5, %o1, %o3 ! ~val & 0x80
- ldx [%sp + STACK_OFFSET + 8], %l1
- sub %o1, %o4, %g1 ! val - 0x01
- ldx [%sp + STACK_OFFSET + 16], %l2
-
- ba 1f
- andcc %o3, %g1, %o3 ! ~val & 0x80 & (val - 0x01)
-
-.zerobyte:
- !! %o5: 0x8080808080808080
- !! %o4: 0x0101010101010101
-	!! %o1: Left-justified dword that contains the zero byte
- !! %o2: Address to be written + 8
-
- andn %o5, %o1, %o3 ! ~val & 0x80
- sub %o1, %o4, %g1 ! val - 0x01
- andcc %o3, %g1, %o3 ! ~val & 0x80 & (val - 0x01)
-
-1: srlx %o3, 7, %o3 ! shift 0x80 -> 0x01
- andn %o3, %o1, %o3 ! mask off leading 0x01 bytes
- lzd %o3, %o4 ! 7, 15, ... 63
-
- mov 64, %o5 ! Calc # of bytes to be discarded
- inc %o4 ! Include the zero byte too
- sub %o5, %o4, %o5 ! after the null byte
- sub %o2, 8, %o2 ! Adjust address which is +8 here.
- srlx %o1, %o5, %o1 ! Discard them
-
-	srlx	%o4, 3, %o4		! convert bit count to bytes to be written
-	dec	%o4			! decrement by 1 to use it as an offset
-
-2: stub %o1, [%o2 + %o4]
- dec %o4
- brgez,pt %o4, 2b
- srlx %o1, 8, %o1
-
-#ifndef __sparcv9
- ldx [%sp + STACK_OFFSET + 24], %g4
- ldx [%sp + STACK_OFFSET + 32], %g5
-#endif
- retl ! done with leaf function
- add %sp, SA(STACK_OFFSET + SAVESIZE), %sp
- SET_SIZE(strcpy)
diff --git a/usr/src/lib/libc/sparc_hwcap1/common/gen/strlen.s b/usr/src/lib/libc/sparc_hwcap1/common/gen/strlen.s
deleted file mode 100644
index d2683ef381..0000000000
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/strlen.s
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
- .file "strlen.s"
-
-/*
- * strlen(s)
- *
- * Given string s, return length (not including the terminating null).
- *
- * Fast assembler language version of the following C-program strlen
- * which represents the `standard' for the C-library.
- *
- * size_t
- * strlen(s)
- * register const char *s;
- * {
- * register const char *s0 = s + 1;
- *
- * while (*s++ != '\0')
- * ;
- * return (s - s0);
- * }
- */
-
-#include <sys/asm_linkage.h>
-
- /*
- * There are two key optimizations in the routine below.
- * First, all memory accesses are 8 bytes wide. The time
- * for long strings is dominated by the latency of load
- * instructions in the inner loop, and going 8 bytes at
- * a time means 1/8th as much latency.
- *
- * Scanning an 8 byte word for a '\0' is made fast by
- * this formula (due to Alan Mycroft):
-	 *	~x & 0x8080808080808080 & (x - 0x0101010101010101)
- * The result of this formula is non-zero iff there's
- * a '\0' somewhere in x.
- *
- * Second, the cost of short strings is dominated by the
- * cost of figuring out which byte out of the last 8
- * contained the '\0' that terminated the string. We use
- * properties of the formula above to convert scanning the
- * word for '\0' into a single LZD instruction.
- */
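In C, the test can be rendered as the following sketch (has_zero_byte is an illustrative name; note the full 8-byte 0x8080808080808080 constant):

    #include <stdint.h>

    /* Nonzero iff the 8-byte word x contains a '\0' byte (Mycroft). */
    static inline uint64_t has_zero_byte(uint64_t x)
    {
        return ((x - 0x0101010101010101ULL) &
                ~x & 0x8080808080808080ULL);
    }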
- .align 64
-	.skip	4*4			! force .strlen_findnull to align to 64 bytes
- ENTRY_NP(strlen)
- and %o0, 7, %o3 ! off = addr & 7
- sethi %hi(0x01010101), %o4 ! 0x01010000
-
- sub %g0, %o3, %o2 ! count = -off
- or %o4, %lo(0x01010101), %o4 ! 0x01010101
-
- ldx [%o0 + %o2], %o1 ! val = *(addr + count)
- sllx %o4, 32, %o5 ! 0x01010101 << 32
-
- mov -1, %g1 ! mask = -1
- sllx %o3, 3, %o3 ! shift = off * 8
-
- or %o4, %o5, %o4 ! 0x0101010101010101
- srlx %g1, %o3, %g1 ! -1 >> ((addr & 7) * 8)
-
- sllx %o4, 7, %o5 ! 0x8080808080808080
- orn %o1, %g1, %o1 ! val |= ~mask
-.strlen_findnull:
- !! %o0 - base address
- !! %o1 - xword from memory
- !! %o2 - index
- !! %o3 - result of test for '\0'
- !! %o4 - constant 0x0101.0101.0101.0101
- !! %o5 - constant 0x8080.8080.8080.8080
- !! %g1 - scratch
- andn %o5, %o1, %o3 ! ~val & 0x80
- sub %o1, %o4, %g1 ! val - 0x01
- andcc %o3, %g1, %o3 ! ~val & 0x80 & (val - 0x01)
- inc 8, %o2
- bz,a,pt %xcc, .strlen_findnull
- ldx [%o0 + %o2], %o1
-
- /*
- * The result of Mycroft's formula is a pattern of 0x80 and
- * 0x00 bytes. There's a 0x80 at every byte position where
- * there was a '\0' character, but a string of 0x01 bytes
- * immediately preceding a '\0' becomes a corresponding
- * string of 0x80 bytes. (e.g. 0x0101010101010100 becomes
- * 0x8080808080808080). We need one final step to discount
- * any leading 0x01 bytes, and then LZD can tell us how many
- * characters there were before the terminating '\0'.
- */
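In C, this final step can be sketched as follows, with __builtin_clzll standing in for LZD and assuming big-endian byte order as on SPARC; first_zero_byte_index is an illustrative name, and y is assumed nonzero (i.e. x is known to contain a '\0'):

    #include <stdint.h>

    /* Byte index (0-7) of the first '\0' in a big-endian xword x,
     * where y is Mycroft's result for x. */
    static inline int first_zero_byte_index(uint64_t x, uint64_t y)
    {
        y = (y >> 7) & ~x;               /* 0x80 -> 0x01, drop 0x01 bytes */
        return (__builtin_clzll(y) >> 3);
    }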
- !! %o1 - last data word
- !! %o2 - length+8, plus 1-8 extra
- !! %o3 - xword with 0x80 for each 0x00 byte and leading 0x01
- sub %o2, 8, %o2 ! subtract off '\0' and last 8
- srlx %o3, 7, %o3 ! shift 0x80 -> 0x01
- andn %o3, %o1, %o3 ! mask off leading 0x01 bytes
- lzd %o3, %o3 ! 7, 15, ... 63
- srlx %o3, 3, %o3 ! 0 ... 7
-
- retl
- add %o2, %o3, %o0 ! add back bytes before '\0'
-
- SET_SIZE(strlen)
diff --git a/usr/src/lib/libc/sparc_hwcap1/sparc/Makefile b/usr/src/lib/libc/sparc_hwcap1/sparc/Makefile
index 3a299a35e0..d648203adc 100644
--- a/usr/src/lib/libc/sparc_hwcap1/sparc/Makefile
+++ b/usr/src/lib/libc/sparc_hwcap1/sparc/Makefile
@@ -28,8 +28,7 @@ LIBCBASE= $(SRC)/lib/libc/sparc
LIBRARY= libc_hwcap1.a
-EXTN_CPPFLAGS= -DSMT_PAUSE_FUNCTION=_rock_pause \
- -I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include
+EXTN_CPPFLAGS= -I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include
EXTN_ASFLAGS= -xarch=v8plusd
EXTN_DYNFLAGS= -M mapfile
@@ -40,10 +39,10 @@ OPTIMIZED_LIBCBASE=../common
PRFOBJS= \
memcpy.o \
+ memmove.o \
memset.o \
strlen.o \
strcpy.o \
- misc.o
MAPFILE_AUX = mapfile-vers-aux
diff --git a/usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile b/usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile
index 7065a134f2..451d682145 100644
--- a/usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile
+++ b/usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile
@@ -27,8 +27,7 @@ LIBCBASE= $(SRC)/lib/libc/sparcv9
LIBRARY= libc_hwcap1.a
-EXTN_CPPFLAGS= -DSMT_PAUSE_FUNCTION=_rock_pause \
- -I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include
+EXTN_CPPFLAGS= -I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include
EXTN_ASFLAGS= -xarch=v9d
EXTN_DYNFLAGS= -M mapfile
@@ -39,10 +38,10 @@ OPTIMIZED_LIBCBASE=../common
PRFOBJS= \
memcpy.o \
+ memmove.o \
memset.o \
strlen.o \
strcpy.o \
- misc.o
MAPFILE_AUX = mapfile-vers-aux