diff options
author | hyw <none@none> | 2006-04-21 13:06:49 -0700 |
---|---|---|
committer | hyw <none@none> | 2006-04-21 13:06:49 -0700 |
commit | e4896ad21c7454d623b59dc9dc6ecbe7ca47941a (patch) | |
tree | b8eb14a96ec8dde7c2024a3602ce861d24a69dae /usr/src/lib/libc_psr | |
parent | 33355266d8b3ad9ad5d65cc1269f1dabef54bb30 (diff) | |
download | illumos-joyent-e4896ad21c7454d623b59dc9dc6ecbe7ca47941a.tar.gz |
6416186 Additional performance tunings for OPL (prefetch params & libc_psr)
Diffstat (limited to 'usr/src/lib/libc_psr')
-rw-r--r-- | usr/src/lib/libc_psr/sun4u-opl/common/memcpy.s | 536 |
1 files changed, 422 insertions, 114 deletions
diff --git a/usr/src/lib/libc_psr/sun4u-opl/common/memcpy.s b/usr/src/lib/libc_psr/sun4u-opl/common/memcpy.s index cf41195e78..0fe2cbf6af 100644 --- a/usr/src/lib/libc_psr/sun4u-opl/common/memcpy.s +++ b/usr/src/lib/libc_psr/sun4u-opl/common/memcpy.s @@ -26,6 +26,7 @@ .ident "%Z%%M% %I% %E% SMI" .file "memcpy.s" + /* * memcpy(s1, s2, len) * @@ -36,7 +37,7 @@ * Fast assembler language version of the following C-program for memcpy * which represents the `standard' for the C-library. * - * void * + * void * * memcpy(void *s, const void *s0, size_t n) * { * if (n != 0) { @@ -54,22 +55,32 @@ #include <sys/sun4asi.h> #include <sys/trap.h> -#define ICACHE_LINE_SIZE 32 -#define BLOCK_SIZE 64 -#define FPRS_FEF 0x4 +#define ICACHE_LINE_SIZE 64 +#define BLOCK_SIZE 64 +#define FPRS_FEF 0x4 + +#define ALIGNED8_FPCOPY_THRESHOLD 1024 +#define ALIGNED4_FPCOPY_THRESHOLD 1024 +#define BST_THRESHOLD 65536 -#define SHORTCOPY 3 +#define SHORTCOPY 3 #define SMALL_MAX 39 #define MEDIUM_MAX 255 -#define MED_WMAX 256 /* max copy for medium word-aligned case */ -#define MED_MAX 65536 /* max copy for medium longword-aligned case */ +#define MED_WMAX 256 /* max copy for medium word-aligned case */ + +#define N_READS_STRONG 20 +#define N_WRITES_STRONG 22 + ANSI_PRAGMA_WEAK(memmove,function) ANSI_PRAGMA_WEAK(memcpy,function) #include "synonyms.h" + ENTRY(memmove) + prefetch [%o1], N_READS_STRONG + prefetch [%o0], N_WRITES_STRONG cmp %o1, %o0 ! if from address is >= to use forward copy bgeu %ncc, .forcpy ! else use backward if ... sub %o0, %o1, %o4 ! get difference of two addresses @@ -77,70 +88,245 @@ bleu %ncc, .forcpy ! if size is bigger, do overlapped copy nop - ! - ! an overlapped copy that must be done "backwards" - ! -.ovbc: - mov %o0, %g1 ! save dest address for return val + ! + ! an overlapped copy that must be done "backwards" + ! +.ovbc: + mov %o0, %g1 ! save dest address for return val add %o1, %o2, %o1 ! get to end of source space - add %o0, %o2, %o0 ! 
get to end of destination space + add %o0, %o2, %o0 ! get to end of destination space -.chksize: - cmp %o2, 8 + cmp %o2, 64 bgeu,pn %ncc, .dbalign - andcc %o0, 7, %g0 ! Is DST 8 byte aligned? - + nop + cmp %o2, 4 + blt,pn %ncc, .byte + sub %o2, 3, %o2 +.byte4loop: + ldub [%o1-1], %o3 ! load last byte + stb %o3, [%o0-1] ! store last byte + sub %o1, 4, %o1 + ldub [%o1+2], %o3 ! load 2nd from last byte + stb %o3, [%o0-2] ! store 2nd from last byte + sub %o0, 4, %o0 + ldub [%o1+1], %o3 ! load 3rd from last byte + stb %o3, [%o0+1] ! store 3rd from last byte + subcc %o2, 4, %o2 + ldub [%o1], %o3 ! load 4th from last byte + bgu,pt %ncc, .byte4loop + stb %o3, [%o0] ! store 4th from last byte .byte: -1: deccc %o2 ! decrement count - blu,pn %ncc, .exit ! loop until done - dec %o0 ! decrement to address - dec %o1 ! decrement from address - ldub [%o1], %o3 ! read a byte - ba 1b ! loop until done + addcc %o2, 3, %o2 + bz,pt %ncc, .exit +.byteloop: + dec %o1 ! decrement src address + ldub [%o1], %o3 ! read a byte + dec %o0 ! decrement dst address + deccc %o2 ! decrement count + bgu,pt %ncc, .byteloop ! loop until done stb %o3, [%o0] ! write byte +.exit: + retl + mov %g1, %o0 + .align 16 .dbalign: - bz %ncc, .dbbck + prefetch [%o1 - (4 * BLOCK_SIZE)], #one_read + prefetch [%o0 - (4 * BLOCK_SIZE)], #one_write + andcc %o0, 7, %o5 ! bytes till DST 8 byte aligned + bz,pt %ncc, .dbmed + sub %o2, %o5, %o2 ! update count +.dbalign1: + dec %o1 ! decrement src address + ldub [%o1], %o3 ! read a byte + dec %o0 ! decrement dst address + deccc %o5 ! decrement count + bgu,pt %ncc, .dbalign1 ! loop until done + stb %o3, [%o0] ! store a byte + +! check for src long word alignment +.dbmed: + andcc %o1, 7, %g0 ! chk src long word alignment + bnz,pn %ncc, .dbbck nop - dec %o1 - dec %o0 - dec %o2 - ldub [%o1], %o3 - ba .chksize - stb %o3, [%o0] +! +! Following code is for overlapping copies where src and dest +! are long word aligned +! +! +! 
For SPARC64-VI, prefetch is effective for both integer and fp register +! operations. There are no benefits in using the fp registers for +! aligned data copying. + +.dbmedl32enter: + subcc %o2, 31, %o2 ! adjust length to allow cc test + ! for end of loop + ble,pt %ncc, .dbmedl31 ! skip big loop if less than 32 + nop +.dbmedl32: + ldx [%o1-8], %o4 ! load + prefetch [%o1 - (8 * BLOCK_SIZE)], #one_read + subcc %o2, 32, %o2 ! decrement length count + stx %o4, [%o0-8] ! and store + prefetch [%o0 - (8 * BLOCK_SIZE)], #one_write + ldx [%o1-16], %o3 ! a block of 32 bytes + sub %o1, 32, %o1 ! decrease src ptr by 32 + stx %o3, [%o0-16] + ldx [%o1+8], %o4 + sub %o0, 32, %o0 ! decrease dst ptr by 32 + stx %o4, [%o0+8] + ldx [%o1], %o3 + bgu,pt %ncc, .dbmedl32 ! repeat if at least 32 bytes left + stx %o3, [%o0] +.dbmedl31: + addcc %o2, 16, %o2 ! adjust remaining count + ble,pt %ncc, .dbmedl15 ! skip if 15 or fewer bytes left + nop ! + ldx [%o1-8], %o4 ! load and store 16 bytes + sub %o1, 16, %o1 ! decrease src ptr by 16 + stx %o4, [%o0-8] ! + sub %o2, 16, %o2 ! decrease count by 16 + ldx [%o1], %o3 ! + sub %o0, 16, %o0 ! decrease dst ptr by 16 + stx %o3, [%o0] +.dbmedl15: + addcc %o2, 15, %o2 ! restore count + bz,pt %ncc, .dbexit ! exit if finished + nop + cmp %o2, 8 + blt,pt %ncc, .dbremain ! skip if 7 or fewer bytes left + nop + ldx [%o1-8], %o4 ! load 8 bytes + sub %o1, 8, %o1 ! decrease src ptr by 8 + stx %o4, [%o0-8] ! and store 8 bytes + subcc %o2, 8, %o2 ! decrease count by 8 + bnz %ncc, .dbremain ! exit if finished + sub %o0, 8, %o0 ! decrease dst ptr by 8 + retl + mov %g1, %o0 +! +! Following code is for overlapping copies where src and dest +! are not long word aligned +! + .align 16 .dbbck: - rd %fprs, %o3 ! o3 = fprs - - ! if fprs.fef == 0, set it. Checking it, requires 2 instructions. - ! So set it anyway, without checking. - wr %g0, 0x4, %fprs ! fprs.fef = 1 - - alignaddr %o1, %g0, %o5 ! align src - ldd [%o5], %d0 ! 
get first 8 byte block - sub %o5, 8, %o5 - andn %o2, 7, %o4 - sub %o1, %o4, %o1 - -2: + rd %fprs, %o3 ! o3 = fprs + + ! if fprs.fef == 0, set it. Checking it, requires 2 instructions. + ! So set it anyway, without checking. + wr %g0, 0x4, %fprs ! fprs.fef = 1 + + alignaddr %o1, %g0, %o5 ! align src + ldd [%o5], %d0 ! get first 8 byte block + andn %o2, 7, %o4 ! prepare src ptr for finishup code + cmp %o2, 32 + blt,pn %ncc, .dbmv8 + sub %o1, %o4, %o1 ! + cmp %o2, 4095 ! check for short memmoves + blt,pn %ncc, .dbmv32enter ! go to no prefetch code +.dbmv64: + ldd [%o5-8], %d2 ! load 8 bytes + ldd [%o5-16], %d4 ! load 8 bytes + sub %o5, 64, %o5 ! + ldd [%o5+40], %d6 ! load 8 bytes + sub %o0, 64, %o0 ! + ldd [%o5+32], %d8 ! load 8 bytes + sub %o2, 64, %o2 ! 64 less bytes to copy + ldd [%o5+24], %d18 ! load 8 bytes + cmp %o2, 64 ! do we have < 64 bytes remaining + ldd [%o5+16], %d28 ! load 8 bytes + ldd [%o5+8], %d30 ! load 8 bytes + faligndata %d2, %d0, %d10 ! extract 8 bytes out + prefetch [%o5 - (5 * BLOCK_SIZE)], #one_read + ldd [%o5], %d0 ! load 8 bytes + std %d10, [%o0+56] ! store the current 8 bytes + faligndata %d4, %d2, %d12 ! extract 8 bytes out + prefetch [%o0 - (5 * BLOCK_SIZE)], #one_write + std %d12, [%o0+48] ! store the current 8 bytes + faligndata %d6, %d4, %d14 ! extract 8 bytes out + std %d14, [%o0+40] ! store the current 8 bytes + faligndata %d8, %d6, %d16 ! extract 8 bytes out + std %d16, [%o0+32] ! store the current 8 bytes + faligndata %d18, %d8, %d20 ! extract 8 bytes out + std %d20, [%o0+24] ! store the current 8 bytes + faligndata %d28, %d18, %d22 ! extract 8 bytes out + std %d22, [%o0+16] ! store the current 8 bytes + faligndata %d30, %d28, %d24 ! extract 8 bytes out + std %d24, [%o0+8] ! store the current 8 bytes + faligndata %d0, %d30, %d26 ! extract 8 bytes out + bgeu,pt %ncc, .dbmv64 + std %d26, [%o0] ! store the current 8 bytes + + cmp %o2, 32 + blt,pn %ncc, .dbmvx + nop +.dbmv32: + ldd [%o5-8], %d2 ! 
load 8 bytes +.dbmv32enter: + ldd [%o5-16], %d4 ! load 8 bytes + sub %o5, 32, %o5 ! + ldd [%o5+8], %d6 ! load 8 bytes + sub %o0, 32, %o0 ! + faligndata %d2, %d0, %d10 ! extract 8 bytes out + ldd [%o5], %d0 ! load 8 bytes + sub %o2,32, %o2 ! 32 less bytes to copy + std %d10, [%o0+24] ! store the current 8 bytes + cmp %o2, 32 ! do we have < 32 bytes remaining + faligndata %d4, %d2, %d12 ! extract 8 bytes out + std %d12, [%o0+16] ! store the current 8 bytes + faligndata %d6, %d4, %d14 ! extract 8 bytes out + std %d14, [%o0+8] ! store the current 8 bytes + faligndata %d0, %d6, %d16 ! extract 8 bytes out + bgeu,pt %ncc, .dbmv32 + std %d16, [%o0] ! store the current 8 bytes +.dbmvx: + cmp %o2, 8 ! do we have < 8 bytes remaining + blt,pt %ncc, .dbmvfinish ! if yes, skip to finish up code + nop +.dbmv8: + ldd [%o5-8], %d2 sub %o0, 8, %o0 ! since we are at the end ! when we first enter the loop - ldd [%o5], %d2 - sub %o2, 8, %o2 ! 8 less bytes to copy + sub %o2, 8, %o2 ! 8 less bytes to copy sub %o5, 8, %o5 cmp %o2, 8 ! do we have < 8 bytes remaining - faligndata %d2, %d0, %d8 ! extract 8 bytes out - std %d8, [%o0] ! store the current 8 bytes - bgeu,pt %ncc, 2b + faligndata %d2, %d0, %d8 ! extract 8 bytes out + std %d8, [%o0] ! store the current 8 bytes + bgeu,pt %ncc, .dbmv8 fmovd %d2, %d0 +.dbmvfinish: + and %o3, 0x4, %o3 ! fprs.du = fprs.dl = 0 + tst %o2 + bz,pt %ncc, .dbexit + wr %o3, %g0, %fprs ! fprs = o3 restore fprs - and %o3, 0x4, %o3 ! fprs.du = fprs.dl = 0 - ba .byte - wr %o3, %g0, %fprs ! fprs = o3 restore fprs - -.exit: +.dbremain: + cmp %o2, 4 + blt,pn %ncc, .dbbyte + nop + ldub [%o1-1], %o3 ! load last byte + stb %o3, [%o0-1] ! store last byte + sub %o1, 4, %o1 + ldub [%o1+2], %o3 ! load 2nd from last byte + stb %o3, [%o0-2] ! store 2nd from last byte + sub %o0, 4, %o0 + ldub [%o1+1], %o3 ! load 3rd from last byte + stb %o3, [%o0+1] ! store 3rd from last byte + subcc %o2, 4, %o2 + ldub [%o1], %o3 ! load 4th from last byte + stb %o3, [%o0] ! 
store 4th from last byte + bz,pt %ncc, .dbexit +.dbbyte: + dec %o1 ! decrement src address + ldub [%o1], %o3 ! read a byte + dec %o0 ! decrement dst address + deccc %o2 ! decrement count + bgu,pt %ncc, .dbbyte ! loop until done + stb %o3, [%o0] ! write byte +.dbexit: retl - mov %g1, %o0 + mov %g1, %o0 SET_SIZE(memmove) @@ -150,8 +336,8 @@ nop ! Do not remove, these nops affect nop ! icache alignment and performance .forcpy: - prefetch [%o1], #n_reads - prefetch [%o0], #n_reads + prefetch [%o1], N_READS_STRONG + prefetch [%o0], N_WRITES_STRONG cmp %o2, SMALL_MAX ! check for not small case bgu,pn %ncc, .medium ! go to larger cases mov %o0, %g1 ! save %o0 @@ -193,7 +379,7 @@ retl mov %g1, %o0 ! restore %o0 - .align 8 + .align 16 nop ! affects loop icache alignment .smallwords: lduw [%o1], %o3 ! read word @@ -243,11 +429,13 @@ mov %g1, %o0 ! restore %o0 .align 16 .medium: + prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read + prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write neg %o0, %o5 - neg %o1, %o3 + neg %o1, %o3 andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned and %o3, 7, %o3 ! bytes till SRC 8 byte aligned - + bz %ncc, 2f sub %o5, %o3, %o3 ! -(bytes till SRC aligned after DST aligned) ! o3={-7, -6, ... 7} o3>0 => SRC overaligned @@ -266,10 +454,10 @@ 2: andcc %o1, 0x3, %g0 ! test alignment - prefetch [%o1 + (1 * BLOCK_SIZE)], #n_reads + prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read bnz,pt %ncc, .mediumsetup ! branch to skip aligned cases ! if src, dst not aligned - prefetch [%o0 + (1 * BLOCK_SIZE)], #n_reads + prefetch [%o0 + (1 * BLOCK_SIZE)], #one_write /* * Handle all cases where src and dest are aligned on word @@ -279,23 +467,22 @@ * to short data moves. */ andcc %o1, 0x7, %g0 ! test word alignment - prefetch [%o1 + (2 * BLOCK_SIZE)], #n_reads + prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read bz,pt %ncc, .medlword ! branch to long word aligned case - prefetch [%o0 + (2 * BLOCK_SIZE)], #n_reads - - cmp %o2, MED_WMAX ! 
limit to store buffer size + prefetch [%o1 + (2 * BLOCK_SIZE)], #one_write + cmp %o2, ALIGNED4_FPCOPY_THRESHOLD ! limit to store buffer size bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop - prefetch [%o1 + (3 * BLOCK_SIZE)], #n_reads + prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read subcc %o2, 15, %o2 ! adjust length to allow cc test - prefetch [%o0 + (3 * BLOCK_SIZE)], #n_reads + prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write ! for end of loop ble,pt %ncc, .medw15 ! skip big loop if less than 16 .empty .medw16: - prefetch [%o1 + (4 * BLOCK_SIZE)], #n_reads + prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read ld [%o1], %o4 ! load subcc %o2, 16, %o2 ! decrement length count - prefetch [%o0 + (4 * BLOCK_SIZE)], #n_reads + prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write stw %o4, [%o0] ! and store ld [%o1+4], %o3 ! a block of 16 bytes add %o1, 16, %o1 ! increase src ptr by 16 @@ -348,30 +535,30 @@ .medwexit: retl mov %g1, %o0 ! restore %o0 - + /* * Special case for handling when src and dest are both long word aligned - * and total data to move is between SMALL_MAX and MED_MAX bytes + * and total data to move is between SMALL_MAX and ALIGNED8_FPCOPY_THRESHOLD + * bytes. */ .align 16 nop .medlword: ! long word aligned - ! length > SMALL_MAX - set MED_MAX, %o4 - cmp %o2, %o4 + ! length > ALIGNED8_FPCOPY_THRESHOLD + cmp %o2, ALIGNED8_FPCOPY_THRESHOLD bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop - prefetch [%o1 + (3 * BLOCK_SIZE)], #n_reads - prefetch [%o0 + (3 * BLOCK_SIZE)], #n_reads + prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read + prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write subcc %o2, 31, %o2 ! adjust length to allow cc test ! for end of loop ble,pt %ncc, .medl31 ! skip big loop if less than 32 .empty .medl32: - prefetch [%o1 + (4 * BLOCK_SIZE)], #n_reads + prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read ldx [%o1], %o4 ! load subcc %o2, 32, %o2 ! 
decrement length count - prefetch [%o0 + (4 * BLOCK_SIZE)], #n_reads + prefetch [%o0 + (4 * BLOCK_SIZE)], #one_read stx %o4, [%o0] ! and store ldx [%o1+8], %o3 ! a block of 32 bytes add %o1, 32, %o1 ! increase src ptr by 32 @@ -409,25 +596,26 @@ ba .medw7 nop - .align 8 + .align 16 + nop + nop + nop .mediumsetup: prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read .mediumrejoin: rd %fprs, %o4 ! check for unused FPU - + add %o1, 8, %o1 ! prepare to round SRC upward - sethi %hi(0x1234567f), %o5 ! For GSR.MASK + sethi %hi(0x1234567f), %o5 ! For GSR.MASK or %o5, 0x67f, %o5 andcc %o4, FPRS_FEF, %o4 ! test FEF, fprs.du = fprs.dl = 0 - prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read bz,a %ncc, 3f wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 3: cmp %o2, MEDIUM_MAX - prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read bmask %o5, %g0, %g0 ! Compute o5 (number of bytes that need copying using the main loop). @@ -466,8 +654,7 @@ andn %o5, 7, %o5 ! 8 byte aligned count neg %o0, %o5 ! 'large' case and %o5, BLOCK_SIZE-1, %o5 ! bytes till DST block aligned -4: - prefetch [%o1 + (6 * BLOCK_SIZE)], #one_read +4: brgez,a %o3, .beginmedloop ldd [%o1-8], %d0 @@ -479,14 +666,13 @@ bnz %ncc, 5b bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2 -.beginmedloop: - prefetch [%o1 + (7 * BLOCK_SIZE)], #one_read +.beginmedloop: tst %o5 bz %ncc, .endmedloop sub %o2, %o5, %o2 ! update count for later ! Main loop to write out doubles. Note: o5 & 7 == 0 - + ldx [%o1], %d2 subcc %o5, 8, %o5 ! update local count bz,pn %ncc, 1f @@ -506,28 +692,27 @@ bnz,pt %ncc, .medloop add %o0, 16, %o0 ! update DST -1: +1: faligndata %d0, %d2, %d4 fmovd %d2, %d0 std %d4, [%o0] ba .endmedloop add %o0, 8, %o0 - + 2: std %d6, [%o0 + 8] sub %o1, 8, %o1 add %o0, 16, %o0 - + .endmedloop: ! Currently, o1 is pointing to the next double-aligned byte in SRC ! The 8 bytes starting at [o1-8] are available in d0 ! At least one, and possibly all, of these need to be written. 
- prefetch [%o1 + (8 * BLOCK_SIZE)], #one_read - cmp %o2, BLOCK_SIZE + cmp %o2, BLOCK_SIZE bgu %ncc, .large ! otherwise, less than 16 bytes left - + #if 0 /* This code will use partial stores. */ @@ -537,13 +722,13 @@ ! fill %d0 with good (unwritten) data. subcc %o2, 8, %o2 ! update count (maybe too much) - movl %ncc, %o2, %o5 + movl %ncc, %o2, %o5 addcc %o3, %o5, %o5 ! extra bytes we can stuff into %d0 sub %o3, %o5, %o3 ! update o3 (# bad bytes in %d0) bz %ncc, 2f alignaddr %o3, %g0, %g0 ! set GSR.ALIGN - + 1: deccc %o5 ldda [%o1]ASI_FL8_P, %d2 @@ -552,14 +737,14 @@ bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2 2: - not %o3 + not %o3 faligndata %d0, %d0, %d0 ! shift bytes to the left and %o3, 7, %o3 ! last byte to be stored in [%o0+%o3] edge8n %g0, %o3, %o5 stda %d0, [%o0]%o5, ASI_PST8_P - brlez %o2, .mediumexit + brlez %o2, .mediumexit add %o0, %o3, %o0 ! update DST to last stored byte -3: +3: inc %o0 deccc %o2 ldub [%o1], %o3 @@ -584,27 +769,27 @@ bgu %ncc, 1b bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2 -2: +2: subcc %o2, 8, %o2 std %d0, [%o0] bz %ncc, .mediumexit add %o0, 8, %o0 -3: +3: ldub [%o1], %o3 deccc %o2 inc %o1 stb %o3, [%o0] bgu %ncc, 3b inc %o0 -#endif +#endif .mediumexit: - wr %o4, %g0, %fprs ! fprs = o4 restore fprs + wr %o4, %g0, %fprs ! fprs = o4 restore fprs retl - mov %g1, %o0 + mov %g1, %o0 - .align 8 + .align ICACHE_LINE_SIZE .large: ! %o0 I/O DST is 64-byte aligned @@ -615,10 +800,131 @@ ! %o4 I Not written. Holds fprs. ! %o5 O The number of doubles that remain to be written. - ! Load the rest of the current block + ! Load the rest of the current block + ! 
Recall that %o1 is further into SRC than %o0 is into DST + + prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read + prefetch [%o1 + (8 * BLOCK_SIZE)], #one_read + + set BST_THRESHOLD, %o5 + cmp %o2, %o5 + bgu,pn %icc, .xlarge + prefetch [%o1 + (12 * BLOCK_SIZE)], #one_read + + ldd [%o1], %f2 + ldd [%o1 + 0x8], %f4 + faligndata %f0, %f2, %f32 + ldd [%o1 + 0x10], %f6 + faligndata %f2, %f4, %f34 + ldd [%o1 + 0x18], %f8 + faligndata %f4, %f6, %f36 + ldd [%o1 + 0x20], %f10 + or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8 + faligndata %f6, %f8, %f38 + prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read + ldd [%o1 + 0x28], %f12 + movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later) + faligndata %f8, %f10, %f40 + ldd [%o1 + 0x30], %f14 + faligndata %f10, %f12, %f42 + ldd [%o1 + 0x38], %f0 + sub %o2, BLOCK_SIZE, %o2 ! update count + add %o1, BLOCK_SIZE, %o1 ! update SRC + + ! Main loop. Write previous block. Load rest of current block. + ! Some bytes will be loaded that won't yet be written. +1: + ldd [%o1], %f2 + faligndata %f12, %f14, %f44 + ldd [%o1 + 0x8], %f4 + faligndata %f14, %f0, %f46 + std %f32, [%o0] + std %f34, [%o0+8] + std %f36, [%o0+16] + std %f38, [%o0+24] + std %f40, [%o0+32] + std %f42, [%o0+40] + std %f44, [%o0+48] + std %f46, [%o0+56] + sub %o2, BLOCK_SIZE, %o2 ! update count + prefetch [%o1 + (24 * BLOCK_SIZE) + BLOCK_SIZE], #one_read + add %o0, BLOCK_SIZE, %o0 ! update DST + ldd [%o1 + 0x10], %f6 + faligndata %f0, %f2, %f32 + ldd [%o1 + 0x18], %f8 + faligndata %f2, %f4, %f34 + ldd [%o1 + 0x20], %f10 + faligndata %f4, %f6, %f36 + ldd [%o1 + 0x28], %f12 + faligndata %f6, %f8, %f38 + ldd [%o1 + 0x30], %f14 + faligndata %f8, %f10, %f40 + ldd [%o1 + 0x38], %f0 + faligndata %f10, %f12, %f42 + prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read + cmp %o2, BLOCK_SIZE + 8 + prefetch [%o0 + (18 * BLOCK_SIZE)], #one_write + bgu,pt %ncc, 1b + add %o1, BLOCK_SIZE, %o1 ! update SRC + faligndata %f12, %f14, %f44 + faligndata %f14, %f0, %f46 + stda %f32, [%o0]ASI_BLK_P ! 
store 64 bytes, bypass cache + cmp %o2, BLOCK_SIZE + bne %ncc, 2f ! exactly 1 block remaining? + add %o0, BLOCK_SIZE, %o0 ! update DST + brz,a %o3, 3f ! is SRC double aligned? + ldd [%o1], %f2 + +2: + add %o5, %o2, %o5 ! %o5 was already set to 0 or -8 + add %o5, %o3, %o5 + + membar #StoreLoad|#StoreStore + + ba .beginmedloop + andn %o5, 7, %o5 ! 8 byte aligned count + + + ! This is when there is exactly 1 block remaining and SRC is aligned +3: + ldd [%o1 + 0x8], %f4 + ldd [%o1 + 0x10], %f6 + fsrc1 %f0, %f32 + ldd [%o1 + 0x18], %f8 + fsrc1 %f2, %f34 + ldd [%o1 + 0x20], %f10 + fsrc1 %f4, %f36 + ldd [%o1 + 0x28], %f12 + fsrc1 %f6, %f38 + ldd [%o1 + 0x30], %f14 + fsrc1 %f8, %f40 + fsrc1 %f10, %f42 + fsrc1 %f12, %f44 + fsrc1 %f14, %f46 + stda %f32, [%o0]ASI_BLK_P + membar #StoreLoad|#StoreStore + wr %o4, 0, %fprs + retl + mov %g1, %o0 + + + .align 16 + ! two nops here causes loop starting at 1f below to be + ! on a cache line boundary, improving performance + nop + nop +.xlarge: + ! %o0 I/O DST is 64-byte aligned + ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN) + ! %d0 I/O already loaded with SRC data from [%o1-8] + ! %o2 I/O count (number of bytes that need to be written) + ! %o3 I Not written. If zero, then SRC is double aligned. + ! %o4 I Not written. Holds fprs. + ! %o5 O The number of doubles that remain to be written. + + ! Load the rest of the current block ! Recall that %o1 is further into SRC than %o0 is into DST - prefetch [%o1 + (9 * BLOCK_SIZE)], #one_read ldd [%o1], %f2 ldd [%o1 + 0x8], %f4 faligndata %f0, %f2, %f32 @@ -627,20 +933,22 @@ ldd [%o1 + 0x18], %f8 faligndata %f4, %f6, %f36 ldd [%o1 + 0x20], %f10 - or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8 + or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8 faligndata %f6, %f8, %f38 ldd [%o1 + 0x28], %f12 movrlz %o3, %g0, %o5 ! 
if %o3 < 0, %o5 = 0 (needed later) + prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read faligndata %f8, %f10, %f40 ldd [%o1 + 0x30], %f14 faligndata %f10, %f12, %f42 ldd [%o1 + 0x38], %f0 + prefetch [%o1 + (17 * BLOCK_SIZE)], #one_read sub %o2, BLOCK_SIZE, %o2 ! update count add %o1, BLOCK_SIZE, %o1 ! update SRC ! This point is 32-byte aligned since 24 instructions appear since ! the previous alignment directive. - + ! Main loop. Write previous block. Load rest of current block. ! Some bytes will be loaded that won't yet be written. @@ -664,25 +972,25 @@ faligndata %f8, %f10, %f40 ldd [%o1 + 0x38], %f0 faligndata %f10, %f12, %f42 - prefetch [%o1 + (10 * BLOCK_SIZE)], #one_read + prefetch [%o1 + (25 * BLOCK_SIZE)], #one_read add %o0, BLOCK_SIZE, %o0 ! update DST cmp %o2, BLOCK_SIZE + 8 ! second prefetch important to correct for occasional dropped - prefetch [%o1 + (6 * BLOCK_SIZE) + 8], #one_read + prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read bgu,pt %ncc, 1b add %o1, BLOCK_SIZE, %o1 ! update SRC faligndata %f12, %f14, %f44 faligndata %f14, %f0, %f46 stda %f32, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache - cmp %o2, BLOCK_SIZE + cmp %o2, BLOCK_SIZE bne %ncc, 2f ! exactly 1 block remaining? add %o0, BLOCK_SIZE, %o0 ! update DST brz,a %o3, 3f ! is SRC double aligned? ldd [%o1], %f2 -2: - add %o5, %o2, %o5 ! %o5 was already set to 0 or -8 +2: + add %o5, %o2, %o5 ! %o5 was already set to 0 or -8 add %o5, %o3, %o5 membar #StoreLoad|#StoreStore |