path: root/usr/src/lib/libc_psr
author		hyw <none@none>	2006-04-21 13:06:49 -0700
committer	hyw <none@none>	2006-04-21 13:06:49 -0700
commit		e4896ad21c7454d623b59dc9dc6ecbe7ca47941a (patch)
tree		b8eb14a96ec8dde7c2024a3602ce861d24a69dae /usr/src/lib/libc_psr
parent		33355266d8b3ad9ad5d65cc1269f1dabef54bb30 (diff)
download	illumos-joyent-e4896ad21c7454d623b59dc9dc6ecbe7ca47941a.tar.gz
6416186 Additional performance tunings for OPL (prefetch params & libc_psr)
Diffstat (limited to 'usr/src/lib/libc_psr')
-rw-r--r--	usr/src/lib/libc_psr/sun4u-opl/common/memcpy.s | 536
1 file changed, 422 insertions(+), 114 deletions(-)
diff --git a/usr/src/lib/libc_psr/sun4u-opl/common/memcpy.s b/usr/src/lib/libc_psr/sun4u-opl/common/memcpy.s
index cf41195e78..0fe2cbf6af 100644
--- a/usr/src/lib/libc_psr/sun4u-opl/common/memcpy.s
+++ b/usr/src/lib/libc_psr/sun4u-opl/common/memcpy.s
@@ -26,6 +26,7 @@
.ident "%Z%%M% %I% %E% SMI"
.file "memcpy.s"
+
/*
* memcpy(s1, s2, len)
*
@@ -36,7 +37,7 @@
* Fast assembler language version of the following C-program for memcpy
* which represents the `standard' for the C-library.
*
- * void *
+ * void *
* memcpy(void *s, const void *s0, size_t n)
* {
* if (n != 0) {
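
The hunk cuts the quoted reference off after its first line; the rest of that
comment, untouched by this commit, is the classic byte-copy loop, roughly:

    void *
    memcpy(void *s, const void *s0, size_t n)
    {
            if (n != 0) {
                    char *s1 = s;
                    const char *s2 = s0;
                    do {
                            *s1++ = *s2++;
                    } while (--n != 0);
            }
            return (s);
    }
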
@@ -54,22 +55,32 @@
#include <sys/sun4asi.h>
#include <sys/trap.h>
-#define ICACHE_LINE_SIZE 32
-#define BLOCK_SIZE 64
-#define FPRS_FEF 0x4
+#define ICACHE_LINE_SIZE 64
+#define BLOCK_SIZE 64
+#define FPRS_FEF 0x4
+
+#define ALIGNED8_FPCOPY_THRESHOLD 1024
+#define ALIGNED4_FPCOPY_THRESHOLD 1024
+#define BST_THRESHOLD 65536
-#define SHORTCOPY 3
+#define SHORTCOPY 3
#define SMALL_MAX 39
#define MEDIUM_MAX 255
-#define MED_WMAX 256 /* max copy for medium word-aligned case */
-#define MED_MAX 65536 /* max copy for medium longword-aligned case */
+#define MED_WMAX 256 /* max copy for medium word-aligned case */
+
+#define N_READS_STRONG 20
+#define N_WRITES_STRONG 22
+
ANSI_PRAGMA_WEAK(memmove,function)
ANSI_PRAGMA_WEAK(memcpy,function)
#include "synonyms.h"
+
ENTRY(memmove)
+ prefetch [%o1], N_READS_STRONG
+ prefetch [%o0], N_WRITES_STRONG
 cmp %o1, %o0 ! if from address >= to address, use forward copy
bgeu %ncc, .forcpy ! else use backward if ...
sub %o0, %o1, %o4 ! get difference of two addresses
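
In C terms, this entry test (together with the size/gap compare elided
between hunks) is the standard memmove overlap dispatch; a minimal sketch
with hypothetical names:

    #include <stddef.h>
    #include <stdint.h>

    /* Forward copy is safe when src >= dst, or when the gap dst - src
     * is at least len, so the regions cannot collide; otherwise copy
     * backward (.ovbc). */
    void *
    memmove_sketch(void *dst, const void *src, size_t len)
    {
            char *d = dst;
            const char *s = src;

            if ((uintptr_t)s >= (uintptr_t)d ||
                (uintptr_t)d - (uintptr_t)s >= len) {
                    while (len-- != 0)      /* .forcpy */
                            *d++ = *s++;
            } else {
                    d += len;               /* .ovbc: start at the ends */
                    s += len;
                    while (len-- != 0)
                            *--d = *--s;
            }
            return (dst);
    }
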
@@ -77,70 +88,245 @@
bleu %ncc, .forcpy ! if size is bigger, do overlapped copy
nop
- !
- ! an overlapped copy that must be done "backwards"
- !
-.ovbc:
- mov %o0, %g1 ! save dest address for return val
+ !
+ ! an overlapped copy that must be done "backwards"
+ !
+.ovbc:
+ mov %o0, %g1 ! save dest address for return val
add %o1, %o2, %o1 ! get to end of source space
- add %o0, %o2, %o0 ! get to end of destination space
+ add %o0, %o2, %o0 ! get to end of destination space
-.chksize:
- cmp %o2, 8
+ cmp %o2, 64
bgeu,pn %ncc, .dbalign
- andcc %o0, 7, %g0 ! Is DST 8 byte aligned?
-
+ nop
+ cmp %o2, 4
+ blt,pn %ncc, .byte
+ sub %o2, 3, %o2
+.byte4loop:
+ ldub [%o1-1], %o3 ! load last byte
+ stb %o3, [%o0-1] ! store last byte
+ sub %o1, 4, %o1
+ ldub [%o1+2], %o3 ! load 2nd from last byte
+ stb %o3, [%o0-2] ! store 2nd from last byte
+ sub %o0, 4, %o0
+ ldub [%o1+1], %o3 ! load 3rd from last byte
+ stb %o3, [%o0+1] ! store 3rd from last byte
+ subcc %o2, 4, %o2
+ ldub [%o1], %o3 ! load 4th from last byte
+ bgu,pt %ncc, .byte4loop
+ stb %o3, [%o0] ! store 4th from last byte
.byte:
-1: deccc %o2 ! decrement count
- blu,pn %ncc, .exit ! loop until done
- dec %o0 ! decrement to address
- dec %o1 ! decrement from address
- ldub [%o1], %o3 ! read a byte
- ba 1b ! loop until done
+ addcc %o2, 3, %o2
+ bz,pt %ncc, .exit
+.byteloop:
+ dec %o1 ! decrement src address
+ ldub [%o1], %o3 ! read a byte
+ dec %o0 ! decrement dst address
+ deccc %o2 ! decrement count
+ bgu,pt %ncc, .byteloop ! loop until done
stb %o3, [%o0] ! write byte
+.exit:
+ retl
+ mov %g1, %o0
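
For readers not fluent in SPARC, a C rendering of the backward byte mover
above (hypothetical helper; d and s point one past the end of each region):

    #include <stddef.h>

    /* .byte4loop: four bytes per iteration while at least 4 remain;
     * .byteloop: finish the last 0-3 bytes one at a time. */
    static void
    copy_backward_bytes(char *d, const char *s, size_t n)
    {
            while (n >= 4) {                /* .byte4loop */
                    d -= 4;
                    s -= 4;
                    d[3] = s[3];
                    d[2] = s[2];
                    d[1] = s[1];
                    d[0] = s[0];
                    n -= 4;
            }
            while (n-- != 0)                /* .byteloop */
                    *--d = *--s;
    }
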
+ .align 16
.dbalign:
- bz %ncc, .dbbck
+ prefetch [%o1 - (4 * BLOCK_SIZE)], #one_read
+ prefetch [%o0 - (4 * BLOCK_SIZE)], #one_write
+ andcc %o0, 7, %o5 ! bytes till DST 8 byte aligned
+ bz,pt %ncc, .dbmed
+ sub %o2, %o5, %o2 ! update count
+.dbalign1:
+ dec %o1 ! decrement src address
+ ldub [%o1], %o3 ! read a byte
+ dec %o0 ! decrement dst address
+ deccc %o5 ! decrement count
+ bgu,pt %ncc, .dbalign1 ! loop until done
+ stb %o3, [%o0] ! store a byte
+
+! check for src long word alignment
+.dbmed:
+ andcc %o1, 7, %g0 ! chk src long word alignment
+ bnz,pn %ncc, .dbbck
nop
- dec %o1
- dec %o0
- dec %o2
- ldub [%o1], %o3
- ba .chksize
- stb %o3, [%o0]
+!
+! The following code is for overlapping copies where src and dest
+! are long word aligned
+!
+!
+! For SPARC64-VI, prefetch is effective for both integer and fp register
+! operations. There are no benefits in using the fp registers for
+! aligned data copying.
+
+.dbmedl32enter:
+ subcc %o2, 31, %o2 ! adjust length to allow cc test
+ ! for end of loop
+ ble,pt %ncc, .dbmedl31 ! skip big loop if less than 32
+ nop
+.dbmedl32:
+ ldx [%o1-8], %o4 ! load
+ prefetch [%o1 - (8 * BLOCK_SIZE)], #one_read
+ subcc %o2, 32, %o2 ! decrement length count
+ stx %o4, [%o0-8] ! and store
+ prefetch [%o0 - (8 * BLOCK_SIZE)], #one_write
+ ldx [%o1-16], %o3 ! a block of 32 bytes
+ sub %o1, 32, %o1 ! decrease src ptr by 32
+ stx %o3, [%o0-16]
+ ldx [%o1+8], %o4
+ sub %o0, 32, %o0 ! decrease dst ptr by 32
+ stx %o4, [%o0+8]
+ ldx [%o1], %o3
+ bgu,pt %ncc, .dbmedl32 ! repeat if at least 32 bytes left
+ stx %o3, [%o0]
+.dbmedl31:
+ addcc %o2, 16, %o2 ! adjust remaining count
+ ble,pt %ncc, .dbmedl15 ! skip if 15 or fewer bytes left
+ nop !
+ ldx [%o1-8], %o4 ! load and store 16 bytes
+ sub %o1, 16, %o1 ! decrease src ptr by 16
+ stx %o4, [%o0-8] !
+ sub %o2, 16, %o2 ! decrease count by 16
+ ldx [%o1], %o3 !
+ sub %o0, 16, %o0 ! decrease dst ptr by 16
+ stx %o3, [%o0]
+.dbmedl15:
+ addcc %o2, 15, %o2 ! restore count
+ bz,pt %ncc, .dbexit ! exit if finished
+ nop
+ cmp %o2, 8
+ blt,pt %ncc, .dbremain ! skip if 7 or fewer bytes left
+ nop
+ ldx [%o1-8], %o4 ! load 8 bytes
+ sub %o1, 8, %o1 ! decrease src ptr by 8
+ stx %o4, [%o0-8] ! and store 8 bytes
+ subcc %o2, 8, %o2 ! decrease count by 8
+ bnz %ncc, .dbremain ! branch if not finished
+ sub %o0, 8, %o0 ! decrease dst ptr by 8
+ retl
+ mov %g1, %o0
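
The aligned backward path reduces to this C shape (a sketch assuming both
pointers are 8-byte aligned and point one past the end; tails elided):

    #include <stdint.h>
    #include <stddef.h>

    /* .dbmedl32 in C: move 32 bytes per iteration with 64-bit
     * accesses, walking downward; .dbmedl31/.dbmedl15 mop up. */
    static void
    copy_backward_aligned(uint64_t *d, const uint64_t *s, size_t n)
    {
            while (n >= 32) {               /* .dbmedl32 */
                    d[-1] = s[-1];
                    d[-2] = s[-2];
                    d[-3] = s[-3];
                    d[-4] = s[-4];
                    d -= 4;
                    s -= 4;
                    n -= 32;
            }
            /* 16-, 8-, and sub-8-byte tails handled as in the assembly */
    }
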
+!
+! The following code is for overlapping copies where src and dest
+! are not long word aligned
+!
+ .align 16
.dbbck:
- rd %fprs, %o3 ! o3 = fprs
-
- ! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
- ! So set it anyway, without checking.
- wr %g0, 0x4, %fprs ! fprs.fef = 1
-
- alignaddr %o1, %g0, %o5 ! align src
- ldd [%o5], %d0 ! get first 8 byte block
- sub %o5, 8, %o5
- andn %o2, 7, %o4
- sub %o1, %o4, %o1
-
-2:
+ rd %fprs, %o3 ! o3 = fprs
+
+ ! if fprs.fef == 0, set it. Checking it requires 2 instructions,
+ ! so set it anyway, without checking.
+ wr %g0, 0x4, %fprs ! fprs.fef = 1
+
+ alignaddr %o1, %g0, %o5 ! align src
+ ldd [%o5], %d0 ! get first 8 byte block
+ andn %o2, 7, %o4 ! prepare src ptr for finishup code
+ cmp %o2, 32
+ blt,pn %ncc, .dbmv8
+ sub %o1, %o4, %o1 !
+ cmp %o2, 4095 ! check for short memmoves
+ blt,pn %ncc, .dbmv32enter ! go to no prefetch code
+.dbmv64:
+ ldd [%o5-8], %d2 ! load 8 bytes
+ ldd [%o5-16], %d4 ! load 8 bytes
+ sub %o5, 64, %o5 !
+ ldd [%o5+40], %d6 ! load 8 bytes
+ sub %o0, 64, %o0 !
+ ldd [%o5+32], %d8 ! load 8 bytes
+ sub %o2, 64, %o2 ! 64 less bytes to copy
+ ldd [%o5+24], %d18 ! load 8 bytes
+ cmp %o2, 64 ! do we have < 64 bytes remaining
+ ldd [%o5+16], %d28 ! load 8 bytes
+ ldd [%o5+8], %d30 ! load 8 bytes
+ faligndata %d2, %d0, %d10 ! extract 8 bytes out
+ prefetch [%o5 - (5 * BLOCK_SIZE)], #one_read
+ ldd [%o5], %d0 ! load 8 bytes
+ std %d10, [%o0+56] ! store the current 8 bytes
+ faligndata %d4, %d2, %d12 ! extract 8 bytes out
+ prefetch [%o0 - (5 * BLOCK_SIZE)], #one_write
+ std %d12, [%o0+48] ! store the current 8 bytes
+ faligndata %d6, %d4, %d14 ! extract 8 bytes out
+ std %d14, [%o0+40] ! store the current 8 bytes
+ faligndata %d8, %d6, %d16 ! extract 8 bytes out
+ std %d16, [%o0+32] ! store the current 8 bytes
+ faligndata %d18, %d8, %d20 ! extract 8 bytes out
+ std %d20, [%o0+24] ! store the current 8 bytes
+ faligndata %d28, %d18, %d22 ! extract 8 bytes out
+ std %d22, [%o0+16] ! store the current 8 bytes
+ faligndata %d30, %d28, %d24 ! extract 8 bytes out
+ std %d24, [%o0+8] ! store the current 8 bytes
+ faligndata %d0, %d30, %d26 ! extract 8 bytes out
+ bgeu,pt %ncc, .dbmv64
+ std %d26, [%o0] ! store the current 8 bytes
+
+ cmp %o2, 32
+ blt,pn %ncc, .dbmvx
+ nop
+.dbmv32:
+ ldd [%o5-8], %d2 ! load 8 bytes
+.dbmv32enter:
+ ldd [%o5-16], %d4 ! load 8 bytes
+ sub %o5, 32, %o5 !
+ ldd [%o5+8], %d6 ! load 8 bytes
+ sub %o0, 32, %o0 !
+ faligndata %d2, %d0, %d10 ! extract 8 bytes out
+ ldd [%o5], %d0 ! load 8 bytes
+ sub %o2, 32, %o2 ! 32 less bytes to copy
+ std %d10, [%o0+24] ! store the current 8 bytes
+ cmp %o2, 32 ! do we have < 32 bytes remaining
+ faligndata %d4, %d2, %d12 ! extract 8 bytes out
+ std %d12, [%o0+16] ! store the current 8 bytes
+ faligndata %d6, %d4, %d14 ! extract 8 bytes out
+ std %d14, [%o0+8] ! store the current 8 bytes
+ faligndata %d0, %d6, %d16 ! extract 8 bytes out
+ bgeu,pt %ncc, .dbmv32
+ std %d16, [%o0] ! store the current 8 bytes
+.dbmvx:
+ cmp %o2, 8 ! do we have < 8 bytes remaining
+ blt,pt %ncc, .dbmvfinish ! if yes, skip to finish up code
+ nop
+.dbmv8:
+ ldd [%o5-8], %d2
sub %o0, 8, %o0 ! since we are at the end
! when we first enter the loop
- ldd [%o5], %d2
- sub %o2, 8, %o2 ! 8 less bytes to copy
+ sub %o2, 8, %o2 ! 8 less bytes to copy
sub %o5, 8, %o5
cmp %o2, 8 ! do we have < 8 bytes remaining
- faligndata %d2, %d0, %d8 ! extract 8 bytes out
- std %d8, [%o0] ! store the current 8 bytes
- bgeu,pt %ncc, 2b
+ faligndata %d2, %d0, %d8 ! extract 8 bytes out
+ std %d8, [%o0] ! store the current 8 bytes
+ bgeu,pt %ncc, .dbmv8
fmovd %d2, %d0
+.dbmvfinish:
+ and %o3, 0x4, %o3 ! fprs.du = fprs.dl = 0
+ tst %o2
+ bz,pt %ncc, .dbexit
+ wr %o3, %g0, %fprs ! fprs = o3 restore fprs
- and %o3, 0x4, %o3 ! fprs.du = fprs.dl = 0
- ba .byte
- wr %o3, %g0, %fprs ! fprs = o3 restore fprs
-
-.exit:
+.dbremain:
+ cmp %o2, 4
+ blt,pn %ncc, .dbbyte
+ nop
+ ldub [%o1-1], %o3 ! load last byte
+ stb %o3, [%o0-1] ! store last byte
+ sub %o1, 4, %o1
+ ldub [%o1+2], %o3 ! load 2nd from last byte
+ stb %o3, [%o0-2] ! store 2nd from last byte
+ sub %o0, 4, %o0
+ ldub [%o1+1], %o3 ! load 3rd from last byte
+ stb %o3, [%o0+1] ! store 3rd from last byte
+ subcc %o2, 4, %o2
+ ldub [%o1], %o3 ! load 4th from last byte
+ stb %o3, [%o0] ! store 4th from last byte
+ bz,pt %ncc, .dbexit
+.dbbyte:
+ dec %o1 ! decrement src address
+ ldub [%o1], %o3 ! read a byte
+ dec %o0 ! decrement dst address
+ deccc %o2 ! decrement count
+ bgu,pt %ncc, .dbbyte ! loop until done
+ stb %o3, [%o0] ! write byte
+.dbexit:
retl
- mov %g1, %o0
+ mov %g1, %o0
SET_SIZE(memmove)
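
The misaligned .dbbck path leans on VIS alignaddr/faligndata. As a rough C
model (big-endian, as on SPARC; not the formal hardware definition),
faligndata extracts 8 destination bytes from two adjacent aligned source
doublewords at the byte offset latched by alignaddr:

    #include <stdint.h>

    /* hi is the aligned doubleword at the lower address, lo the next
     * one; off (0-7) is the byte offset set by alignaddr. */
    static uint64_t
    faligndata_model(uint64_t hi, uint64_t lo, unsigned off)
    {
            if (off == 0)
                    return (hi);
            return ((hi << (8 * off)) | (lo >> (8 * (8 - off))));
    }
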
@@ -150,8 +336,8 @@
 nop ! Do not remove; these nops affect
 nop ! icache alignment and performance
.forcpy:
- prefetch [%o1], #n_reads
- prefetch [%o0], #n_reads
+ prefetch [%o1], N_READS_STRONG
+ prefetch [%o0], N_WRITES_STRONG
cmp %o2, SMALL_MAX ! check for not small case
bgu,pn %ncc, .medium ! go to larger cases
mov %o0, %g1 ! save %o0
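
This change swaps the generic #n_reads hint for the strong read/write
variants (the N_READS_STRONG/N_WRITES_STRONG codes defined earlier) so the
destination is prefetched for writing. GCC's portable analog, for
illustration only (the strong/weak distinction is an OPL prefetch-fcn
encoding this builtin does not expose):

    /* __builtin_prefetch(addr, rw, locality):
     *   rw:       0 = prefetch for read, 1 = prefetch for write
     *   locality: 0 = streaming (use once) ... 3 = keep in caches */
    static void
    prefetch_pair(const void *src, void *dst)
    {
            __builtin_prefetch(src, 0, 3);  /* cf. prefetch [%o1], N_READS_STRONG  */
            __builtin_prefetch(dst, 1, 3);  /* cf. prefetch [%o0], N_WRITES_STRONG */
    }
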
@@ -193,7 +379,7 @@
retl
mov %g1, %o0 ! restore %o0
- .align 8
+ .align 16
nop ! affects loop icache alignment
.smallwords:
lduw [%o1], %o3 ! read word
@@ -243,11 +429,13 @@
mov %g1, %o0 ! restore %o0
.align 16
.medium:
+ prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
+ prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
neg %o0, %o5
- neg %o1, %o3
+ neg %o1, %o3
andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned
and %o3, 7, %o3 ! bytes till SRC 8 byte aligned
-
+
bz %ncc, 2f
sub %o5, %o3, %o3 ! -(bytes till SRC aligned after DST aligned)
! o3={-7, -6, ... 7} o3>0 => SRC overaligned
@@ -266,10 +454,10 @@
2:
andcc %o1, 0x3, %g0 ! test alignment
- prefetch [%o1 + (1 * BLOCK_SIZE)], #n_reads
+ prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read
bnz,pt %ncc, .mediumsetup ! branch to skip aligned cases
! if src, dst not aligned
- prefetch [%o0 + (1 * BLOCK_SIZE)], #n_reads
+ prefetch [%o0 + (1 * BLOCK_SIZE)], #one_write
/*
* Handle all cases where src and dest are aligned on word
@@ -279,23 +467,22 @@
* to short data moves.
*/
andcc %o1, 0x7, %g0 ! test word alignment
- prefetch [%o1 + (2 * BLOCK_SIZE)], #n_reads
+ prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
bz,pt %ncc, .medlword ! branch to long word aligned case
- prefetch [%o0 + (2 * BLOCK_SIZE)], #n_reads
-
- cmp %o2, MED_WMAX ! limit to store buffer size
+ prefetch [%o0 + (2 * BLOCK_SIZE)], #one_write
+ cmp %o2, ALIGNED4_FPCOPY_THRESHOLD ! limit to store buffer size
bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop
- prefetch [%o1 + (3 * BLOCK_SIZE)], #n_reads
+ prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
subcc %o2, 15, %o2 ! adjust length to allow cc test
- prefetch [%o0 + (3 * BLOCK_SIZE)], #n_reads
+ prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
! for end of loop
ble,pt %ncc, .medw15 ! skip big loop if less than 16
.empty
.medw16:
- prefetch [%o1 + (4 * BLOCK_SIZE)], #n_reads
+ prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
ld [%o1], %o4 ! load
subcc %o2, 16, %o2 ! decrement length count
- prefetch [%o0 + (4 * BLOCK_SIZE)], #n_reads
+ prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
stw %o4, [%o0] ! and store
ld [%o1+4], %o3 ! a block of 16 bytes
add %o1, 16, %o1 ! increase src ptr by 16
@@ -348,30 +535,30 @@
.medwexit:
retl
mov %g1, %o0 ! restore %o0
-
+
/*
* Special case for handling when src and dest are both long word aligned
- * and total data to move is between SMALL_MAX and MED_MAX bytes
+ * and total data to move is between SMALL_MAX and ALIGNED8_FPCOPY_THRESHOLD
+ * bytes.
*/
.align 16
nop
.medlword: ! long word aligned
- ! length > SMALL_MAX
- set MED_MAX, %o4
- cmp %o2, %o4
+ ! length > ALIGNED8_FPCOPY_THRESHOLD
+ cmp %o2, ALIGNED8_FPCOPY_THRESHOLD
bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop
- prefetch [%o1 + (3 * BLOCK_SIZE)], #n_reads
- prefetch [%o0 + (3 * BLOCK_SIZE)], #n_reads
+ prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
+ prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
subcc %o2, 31, %o2 ! adjust length to allow cc test
! for end of loop
ble,pt %ncc, .medl31 ! skip big loop if less than 32
.empty
.medl32:
- prefetch [%o1 + (4 * BLOCK_SIZE)], #n_reads
+ prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
ldx [%o1], %o4 ! load
subcc %o2, 32, %o2 ! decrement length count
- prefetch [%o0 + (4 * BLOCK_SIZE)], #n_reads
+ prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
stx %o4, [%o0] ! and store
ldx [%o1+8], %o3 ! a block of 32 bytes
add %o1, 32, %o1 ! increase src ptr by 32
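
Taken together, the new thresholds give the routine this overall shape; a
hedged C outline (control flow inferred from the diff, with a stand-in body
and the assembly labels in comments, not a literal transcription):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    #define SMALL_MAX                       39
    #define ALIGNED8_FPCOPY_THRESHOLD       1024
    #define ALIGNED4_FPCOPY_THRESHOLD       1024
    #define BST_THRESHOLD                   65536

    void *
    opl_memcpy_outline(void *dst, const void *src, size_t len)
    {
            uintptr_t a = (uintptr_t)dst | (uintptr_t)src;

            if (len <= SMALL_MAX) {
                    /* short integer copy */
            } else if ((a & 7) == 0 && len <= ALIGNED8_FPCOPY_THRESHOLD) {
                    /* .medlword/.medl32: 64-bit integer loop */
            } else if ((a & 3) == 0 && len <= ALIGNED4_FPCOPY_THRESHOLD) {
                    /* .medw16: 32-bit integer loop */
            } else if (len <= BST_THRESHOLD) {
                    /* .large: FP faligndata loop, ordinary stores */
            } else {
                    /* .xlarge: FP loop with ASI_BLK_P block stores */
            }
            return (memcpy(dst, src, len)); /* stand-in for all branches */
    }
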
@@ -409,25 +596,26 @@
ba .medw7
nop
- .align 8
+ .align 16
+ nop
+ nop
+ nop
.mediumsetup:
prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
.mediumrejoin:
rd %fprs, %o4 ! check for unused FPU
-
+
add %o1, 8, %o1 ! prepare to round SRC upward
- sethi %hi(0x1234567f), %o5 ! For GSR.MASK
+ sethi %hi(0x1234567f), %o5 ! For GSR.MASK
or %o5, 0x67f, %o5
andcc %o4, FPRS_FEF, %o4 ! test FEF, fprs.du = fprs.dl = 0
- prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
bz,a %ncc, 3f
wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
3:
cmp %o2, MEDIUM_MAX
- prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
bmask %o5, %g0, %g0
! Compute o5 (number of bytes that need copying using the main loop).
@@ -466,8 +654,7 @@
andn %o5, 7, %o5 ! 8 byte aligned count
neg %o0, %o5 ! 'large' case
and %o5, BLOCK_SIZE-1, %o5 ! bytes till DST block aligned
-4:
- prefetch [%o1 + (6 * BLOCK_SIZE)], #one_read
+4:
brgez,a %o3, .beginmedloop
ldd [%o1-8], %d0
@@ -479,14 +666,13 @@
bnz %ncc, 5b
bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2
-.beginmedloop:
- prefetch [%o1 + (7 * BLOCK_SIZE)], #one_read
+.beginmedloop:
tst %o5
bz %ncc, .endmedloop
sub %o2, %o5, %o2 ! update count for later
! Main loop to write out doubles. Note: o5 & 7 == 0
-
+
ldx [%o1], %d2
subcc %o5, 8, %o5 ! update local count
bz,pn %ncc, 1f
@@ -506,28 +692,27 @@
bnz,pt %ncc, .medloop
add %o0, 16, %o0 ! update DST
-1:
+1:
faligndata %d0, %d2, %d4
fmovd %d2, %d0
std %d4, [%o0]
ba .endmedloop
add %o0, 8, %o0
-
+
2:
std %d6, [%o0 + 8]
sub %o1, 8, %o1
add %o0, 16, %o0
-
+
.endmedloop:
! Currently, o1 is pointing to the next double-aligned byte in SRC
! The 8 bytes starting at [o1-8] are available in d0
! At least one, and possibly all, of these need to be written.
- prefetch [%o1 + (8 * BLOCK_SIZE)], #one_read
- cmp %o2, BLOCK_SIZE
+ cmp %o2, BLOCK_SIZE
bgu %ncc, .large ! otherwise, less than 16 bytes left
-
+
#if 0
/* This code will use partial stores. */
@@ -537,13 +722,13 @@
! fill %d0 with good (unwritten) data.
subcc %o2, 8, %o2 ! update count (maybe too much)
- movl %ncc, %o2, %o5
+ movl %ncc, %o2, %o5
addcc %o3, %o5, %o5 ! extra bytes we can stuff into %d0
sub %o3, %o5, %o3 ! update o3 (# bad bytes in %d0)
bz %ncc, 2f
alignaddr %o3, %g0, %g0 ! set GSR.ALIGN
-
+
1:
deccc %o5
ldda [%o1]ASI_FL8_P, %d2
@@ -552,14 +737,14 @@
bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2
2:
- not %o3
+ not %o3
faligndata %d0, %d0, %d0 ! shift bytes to the left
and %o3, 7, %o3 ! last byte to be stored in [%o0+%o3]
edge8n %g0, %o3, %o5
stda %d0, [%o0]%o5, ASI_PST8_P
- brlez %o2, .mediumexit
+ brlez %o2, .mediumexit
add %o0, %o3, %o0 ! update DST to last stored byte
-3:
+3:
inc %o0
deccc %o2
ldub [%o1], %o3
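
The disabled (#if 0) tail above relies on VIS partial stores: edge8n builds
a byte mask and the stda through ASI_PST8_P writes only the selected bytes.
A rough C model of the masked store (illustrative; on SPARC it is a single
stda):

    #include <stdint.h>

    /* mask bit 7 selects byte 0 (big-endian); write selected bytes */
    static void
    partial_store_model(uint8_t *dst, const uint8_t *src8, uint8_t mask)
    {
            unsigned i;

            for (i = 0; i < 8; i++)
                    if (mask & (0x80u >> i))
                            dst[i] = src8[i];
    }
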
@@ -584,27 +769,27 @@
bgu %ncc, 1b
bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2
-2:
+2:
subcc %o2, 8, %o2
std %d0, [%o0]
bz %ncc, .mediumexit
add %o0, 8, %o0
-3:
+3:
ldub [%o1], %o3
deccc %o2
inc %o1
stb %o3, [%o0]
bgu %ncc, 3b
inc %o0
-#endif
+#endif
.mediumexit:
- wr %o4, %g0, %fprs ! fprs = o4 restore fprs
+ wr %o4, %g0, %fprs ! fprs = o4 restore fprs
retl
- mov %g1, %o0
+ mov %g1, %o0
- .align 8
+ .align ICACHE_LINE_SIZE
.large:
! %o0 I/O DST is 64-byte aligned
@@ -615,10 +800,131 @@
! %o4 I Not written. Holds fprs.
! %o5 O The number of doubles that remain to be written.
- ! Load the rest of the current block
+ ! Load the rest of the current block
+ ! Recall that %o1 is further into SRC than %o0 is into DST
+
+ prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
+ prefetch [%o1 + (8 * BLOCK_SIZE)], #one_read
+
+ set BST_THRESHOLD, %o5
+ cmp %o2, %o5
+ bgu,pn %icc, .xlarge
+ prefetch [%o1 + (12 * BLOCK_SIZE)], #one_read
+
+ ldd [%o1], %f2
+ ldd [%o1 + 0x8], %f4
+ faligndata %f0, %f2, %f32
+ ldd [%o1 + 0x10], %f6
+ faligndata %f2, %f4, %f34
+ ldd [%o1 + 0x18], %f8
+ faligndata %f4, %f6, %f36
+ ldd [%o1 + 0x20], %f10
+ or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8
+ faligndata %f6, %f8, %f38
+ prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
+ ldd [%o1 + 0x28], %f12
+ movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later)
+ faligndata %f8, %f10, %f40
+ ldd [%o1 + 0x30], %f14
+ faligndata %f10, %f12, %f42
+ ldd [%o1 + 0x38], %f0
+ sub %o2, BLOCK_SIZE, %o2 ! update count
+ add %o1, BLOCK_SIZE, %o1 ! update SRC
+
+ ! Main loop. Write previous block. Load rest of current block.
+ ! Some bytes will be loaded that won't yet be written.
+1:
+ ldd [%o1], %f2
+ faligndata %f12, %f14, %f44
+ ldd [%o1 + 0x8], %f4
+ faligndata %f14, %f0, %f46
+ std %f32, [%o0]
+ std %f34, [%o0+8]
+ std %f36, [%o0+16]
+ std %f38, [%o0+24]
+ std %f40, [%o0+32]
+ std %f42, [%o0+40]
+ std %f44, [%o0+48]
+ std %f46, [%o0+56]
+ sub %o2, BLOCK_SIZE, %o2 ! update count
+ prefetch [%o1 + (24 * BLOCK_SIZE) + BLOCK_SIZE], #one_read
+ add %o0, BLOCK_SIZE, %o0 ! update DST
+ ldd [%o1 + 0x10], %f6
+ faligndata %f0, %f2, %f32
+ ldd [%o1 + 0x18], %f8
+ faligndata %f2, %f4, %f34
+ ldd [%o1 + 0x20], %f10
+ faligndata %f4, %f6, %f36
+ ldd [%o1 + 0x28], %f12
+ faligndata %f6, %f8, %f38
+ ldd [%o1 + 0x30], %f14
+ faligndata %f8, %f10, %f40
+ ldd [%o1 + 0x38], %f0
+ faligndata %f10, %f12, %f42
+ prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
+ cmp %o2, BLOCK_SIZE + 8
+ prefetch [%o0 + (18 * BLOCK_SIZE)], #one_write
+ bgu,pt %ncc, 1b
+ add %o1, BLOCK_SIZE, %o1 ! update SRC
+ faligndata %f12, %f14, %f44
+ faligndata %f14, %f0, %f46
+ stda %f32, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache
+ cmp %o2, BLOCK_SIZE
+ bne %ncc, 2f ! exactly 1 block remaining?
+ add %o0, BLOCK_SIZE, %o0 ! update DST
+ brz,a %o3, 3f ! is SRC double aligned?
+ ldd [%o1], %f2
+
+2:
+ add %o5, %o2, %o5 ! %o5 was already set to 0 or -8
+ add %o5, %o3, %o5
+
+ membar #StoreLoad|#StoreStore
+
+ ba .beginmedloop
+ andn %o5, 7, %o5 ! 8 byte aligned count
+
+
+ ! This is when there is exactly 1 block remaining and SRC is aligned
+3:
+ ldd [%o1 + 0x8], %f4
+ ldd [%o1 + 0x10], %f6
+ fsrc1 %f0, %f32
+ ldd [%o1 + 0x18], %f8
+ fsrc1 %f2, %f34
+ ldd [%o1 + 0x20], %f10
+ fsrc1 %f4, %f36
+ ldd [%o1 + 0x28], %f12
+ fsrc1 %f6, %f38
+ ldd [%o1 + 0x30], %f14
+ fsrc1 %f8, %f40
+ fsrc1 %f10, %f42
+ fsrc1 %f12, %f44
+ fsrc1 %f14, %f46
+ stda %f32, [%o0]ASI_BLK_P
+ membar #StoreLoad|#StoreStore
+ wr %o4, 0, %fprs
+ retl
+ mov %g1, %o0
+
+
+ .align 16
+ ! two nops here cause the loop starting at 1f below to land
+ ! on a cache line boundary, improving performance
+ nop
+ nop
+.xlarge:
+ ! %o0 I/O DST is 64-byte aligned
+ ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
+ ! %d0 I/O already loaded with SRC data from [%o1-8]
+ ! %o2 I/O count (number of bytes that need to be written)
+ ! %o3 I Not written. If zero, then SRC is double aligned.
+ ! %o4 I Not written. Holds fprs.
+ ! %o5 O The number of doubles that remain to be written.
+
+ ! Load the rest of the current block
! Recall that %o1 is further into SRC than %o0 is into DST
- prefetch [%o1 + (9 * BLOCK_SIZE)], #one_read
ldd [%o1], %f2
ldd [%o1 + 0x8], %f4
faligndata %f0, %f2, %f32
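
The main loop just shown (and the .xlarge variant that follows) is
software-pipelined: the 64-byte block aligned on the previous iteration is
stored while the next block's loads are issued, hiding load latency behind
the stores. The shape, as a C sketch (hypothetical helper; pointers 8-byte
aligned, len a multiple of 64 and at least 128):

    #include <stdint.h>
    #include <stddef.h>

    static void
    copy_pipelined(uint64_t *d, const uint64_t *s, size_t len)
    {
            uint64_t cur[8], nxt[8];
            size_t i;

            for (i = 0; i < 8; i++)         /* prime: load first block */
                    cur[i] = s[i];
            s += 8;
            len -= 64;

            while (len >= 64) {
                    for (i = 0; i < 8; i++) /* load next block */
                            nxt[i] = s[i];
                    for (i = 0; i < 8; i++) /* store previous block */
                            d[i] = cur[i];
                    for (i = 0; i < 8; i++)
                            cur[i] = nxt[i];
                    d += 8;
                    s += 8;
                    len -= 64;
            }
            for (i = 0; i < 8; i++)         /* drain: final block */
                    d[i] = cur[i];
    }
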
@@ -627,20 +933,22 @@
ldd [%o1 + 0x18], %f8
faligndata %f4, %f6, %f36
ldd [%o1 + 0x20], %f10
- or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8
+ or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8
faligndata %f6, %f8, %f38
ldd [%o1 + 0x28], %f12
movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later)
+ prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
faligndata %f8, %f10, %f40
ldd [%o1 + 0x30], %f14
faligndata %f10, %f12, %f42
ldd [%o1 + 0x38], %f0
+ prefetch [%o1 + (17 * BLOCK_SIZE)], #one_read
sub %o2, BLOCK_SIZE, %o2 ! update count
add %o1, BLOCK_SIZE, %o1 ! update SRC
 ! This point is 32-byte aligned because 24 instructions appear since
 ! the previous alignment directive.
-
+
! Main loop. Write previous block. Load rest of current block.
! Some bytes will be loaded that won't yet be written.
@@ -664,25 +972,25 @@
faligndata %f8, %f10, %f40
ldd [%o1 + 0x38], %f0
faligndata %f10, %f12, %f42
- prefetch [%o1 + (10 * BLOCK_SIZE)], #one_read
+ prefetch [%o1 + (25 * BLOCK_SIZE)], #one_read
add %o0, BLOCK_SIZE, %o0 ! update DST
cmp %o2, BLOCK_SIZE + 8
! second prefetch important to correct for occasional dropped
- prefetch [%o1 + (6 * BLOCK_SIZE) + 8], #one_read
+ prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
bgu,pt %ncc, 1b
add %o1, BLOCK_SIZE, %o1 ! update SRC
faligndata %f12, %f14, %f44
faligndata %f14, %f0, %f46
stda %f32, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache
- cmp %o2, BLOCK_SIZE
+ cmp %o2, BLOCK_SIZE
bne %ncc, 2f ! exactly 1 block remaining?
add %o0, BLOCK_SIZE, %o0 ! update DST
brz,a %o3, 3f ! is SRC double aligned?
ldd [%o1], %f2
-2:
- add %o5, %o2, %o5 ! %o5 was already set to 0 or -8
+2:
+ add %o5, %o2, %o5 ! %o5 was already set to 0 or -8
add %o5, %o3, %o5
membar #StoreLoad|#StoreStore
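
The hunk above pushes the read prefetches from 10 and 6 blocks ahead to 25
and 18. The distance follows from latency-bandwidth arithmetic; a
back-of-envelope helper (the nanosecond figures below are illustrative
assumptions, not from the commit):

    /* blocks the prefetch must run ahead so data arrives before use:
     * ceil(memory_latency / time_per_block_iteration) */
    static unsigned
    prefetch_distance(unsigned latency_ns, unsigned ns_per_block)
    {
            return ((latency_ns + ns_per_block - 1) / ns_per_block);
    }

    /* e.g. 400 ns latency, 16 ns per 64-byte iteration -> 25 blocks */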