Diffstat (limited to 'usr/src/lib/libc/capabilities/sun4u-us3/common/memcpy.s')
-rw-r--r-- | usr/src/lib/libc/capabilities/sun4u-us3/common/memcpy.s | 1053
1 file changed, 1053 insertions, 0 deletions
diff --git a/usr/src/lib/libc/capabilities/sun4u-us3/common/memcpy.s b/usr/src/lib/libc/capabilities/sun4u-us3/common/memcpy.s new file mode 100644 index 0000000000..5b8bbff7cc --- /dev/null +++ b/usr/src/lib/libc/capabilities/sun4u-us3/common/memcpy.s @@ -0,0 +1,1053 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + */ + + .file "memcpy.s" + +/* + * memcpy(s1, s2, len) + * + * Copy s2 to s1, always copy n bytes. + * Note: this C code does not work for overlapped copies. + * Memmove() and bcopy() do. + * + * Fast assembler language version of the following C-program for memcpy + * which represents the `standard' for the C-library. + * + * void * + * memcpy(void *s, const void *s0, size_t n) + * { + * if (n != 0) { + * char *s1 = s; + * const char *s2 = s0; + * do { + * *s1++ = *s2++; + * } while (--n != 0); + * } + * return (s); + * } + */ + +#include <sys/asm_linkage.h> +#include <sys/sun4asi.h> +#include <sys/trap.h> + +#define ICACHE_LINE_SIZE 64 +#define BLOCK_SIZE 64 +#define FPRS_FEF 0x4 + +#define SHORTCOPY 3 +#define SMALL_MAX 39 +#define MEDIUM_MAX 255 +#define MED_WMAX 256 /* max copy for medium word-aligned case */ +#define MED_MAX 256 /* max copy for medium longword-aligned case */ + +#ifndef BSTORE_SIZE +#define BSTORE_SIZE 256 /* min copy size for block store */ +#endif + + ANSI_PRAGMA_WEAK(memmove,function) + ANSI_PRAGMA_WEAK(memcpy,function) + + ENTRY(memmove) + cmp %o1, %o0 ! if from address is >= to use forward copy + bgeu %ncc, .forcpy ! else use backward if ... + sub %o0, %o1, %o4 ! get difference of two addresses + cmp %o2, %o4 ! compare size and difference of addresses + bleu %ncc, .forcpy ! if size is bigger, do overlapped copy + nop + + ! + ! an overlapped copy that must be done "backwards" + ! +.ovbc: + mov %o0, %g1 ! save dest address for return val + add %o1, %o2, %o1 ! get to end of source space + add %o0, %o2, %o0 ! get to end of destination space + + cmp %o2, 24 + bgeu,pn %ncc, .dbalign + nop + cmp %o2, 4 + blt,pn %ncc, .byte + sub %o2, 3, %o2 +.byte4loop: + ldub [%o1-1], %o3 ! load last byte + stb %o3, [%o0-1] ! store last byte + sub %o1, 4, %o1 + ldub [%o1+2], %o3 ! load 2nd from last byte + stb %o3, [%o0-2] ! store 2nd from last byte + sub %o0, 4, %o0 + ldub [%o1+1], %o3 ! load 3rd from last byte + stb %o3, [%o0+1] ! store 3rd from last byte + subcc %o2, 4, %o2 + ldub [%o1], %o3 ! load 4th from last byte + bgu,pt %ncc, .byte4loop + stb %o3, [%o0] ! store 4th from last byte +.byte: + addcc %o2, 3, %o2 + bz,pt %ncc, .exit +.byteloop: + dec %o1 ! decrement src address + ldub [%o1], %o3 ! read a byte + dec %o0 ! decrement dst address + deccc %o2 ! decrement count + bgu,pt %ncc, .byteloop ! 
loop until done + stb %o3, [%o0] ! write byte +.exit: + retl + mov %g1, %o0 + + .align 16 +.dbalign: + andcc %o0, 7, %o5 ! bytes till DST 8 byte aligned + bz,pt %ncc, .dbmed + sub %o2, %o5, %o2 ! update count +.dbalign1: + dec %o1 ! decrement src address + ldub [%o1], %o3 ! read a byte + dec %o0 ! decrement dst address + deccc %o5 ! decrement count + bgu,pt %ncc, .dbalign1 ! loop until done + stb %o3, [%o0] ! store a byte + +! check for src long word alignment +.dbmed: + andcc %o1, 7, %g0 ! chk src long word alignment + bnz,pn %ncc, .dbbck + nop +! +! Following code is for overlapping copies where src and dest +! are long word aligned +! + cmp %o2, 4095 + blt,pn %ncc, .dbmedl32enter ! go to no prefetch code + nop + prefetch [%o1 - (1 * BLOCK_SIZE)], 20 ! into the prefetch cache + sub %o2, 63, %o2 ! adjust length to allow cc test + ! for end of loop + prefetch [%o1 - (2 * BLOCK_SIZE)], 20 ! into the prefetch cache + rd %fprs, %o3 ! o3 = fprs + ! if fprs.fef == 0, set it. Checking it, requires 2 instructions. + ! So set it anyway, without checking. + prefetch [%o1 - (3 * BLOCK_SIZE)], 20 ! into the prefetch cache + wr %g0, 0x4, %fprs ! fprs.fef = 1 + prefetch [%o1 - (4 * BLOCK_SIZE)], 20 ! into the prefetch cache +.dbmedl64: + prefetch [%o1 - (5 * BLOCK_SIZE)], 20 ! into the prefetch cache + ldd [%o1-8], %d4 ! load + subcc %o2, 64, %o2 ! decrement length count + std %d4, [%o0-8] ! and store + ldd [%o1-16], %d2 ! a block of 64 bytes + sub %o1, 64, %o1 ! decrease src ptr by 64 + std %d2, [%o0-16] + sub %o0, 64, %o0 ! decrease dst ptr by 64 + ldd [%o1+40], %d4 + std %d4, [%o0+40] + ldd [%o1+32], %d2 + std %d2, [%o0+32] + ldd [%o1+24], %d4 + std %d4, [%o0+24] + ldd [%o1+16], %d2 + std %d2, [%o0+16] + ldd [%o1+8], %d4 + std %d4, [%o0+8] + ldd [%o1], %d2 + bgu,pt %ncc, .dbmedl64 ! repeat if at least 64 bytes left + std %d2, [%o0] + add %o2, 63, %o2 ! restore offset adjustment + and %o3, 0x4, %o3 ! fprs.du = fprs.dl = 0 + wr %o3, %g0, %fprs ! fprs = o3 restore fprs +.dbmedl32enter: + subcc %o2, 31, %o2 ! adjust length to allow cc test + ! for end of loop + ble,pt %ncc, .dbmedl31 ! skip big loop if less than 32 + nop +.dbmedl32: + ldx [%o1-8], %o4 ! load + subcc %o2, 32, %o2 ! decrement length count + stx %o4, [%o0-8] ! and store + ldx [%o1-16], %o3 ! a block of 32 bytes + sub %o1, 32, %o1 ! decrease src ptr by 32 + stx %o3, [%o0-16] + ldx [%o1+8], %o4 + sub %o0, 32, %o0 ! decrease dst ptr by 32 + stx %o4, [%o0+8] + ldx [%o1], %o3 + bgu,pt %ncc, .dbmedl32 ! repeat if at least 32 bytes left + stx %o3, [%o0] +.dbmedl31: + addcc %o2, 16, %o2 ! adjust remaining count + ble,pt %ncc, .dbmedl15 ! skip if 15 or fewer bytes left + nop ! + ldx [%o1-8], %o4 ! load and store 16 bytes + sub %o1, 16, %o1 ! decrease src ptr by 16 + stx %o4, [%o0-8] ! + sub %o2, 16, %o2 ! decrease count by 16 + ldx [%o1], %o3 ! + sub %o0, 16, %o0 ! decrease dst ptr by 16 + stx %o3, [%o0] +.dbmedl15: + addcc %o2, 15, %o2 ! restore count + bz,pt %ncc, .dbexit ! exit if finished + nop + cmp %o2, 8 + blt,pt %ncc, .dbremain ! skip if 7 or fewer bytes left + nop + ldx [%o1-8], %o4 ! load 8 bytes + sub %o1, 8, %o1 ! decrease src ptr by 8 + stx %o4, [%o0-8] ! and store 8 bytes + subcc %o2, 8, %o2 ! decrease count by 8 + bnz %ncc, .dbremain ! exit if finished + sub %o0, 8, %o0 ! decrease dst ptr by 8 + retl + mov %g1, %o0 + +! +! Following code is for overlapping copies where src and dest +! are not long word aligned +! + .align 16 +.dbbck: + rd %fprs, %o3 ! o3 = fprs + + ! if fprs.fef == 0, set it. Checking it, requires 2 instructions. + ! 
So set it anyway, without checking. + wr %g0, 0x4, %fprs ! fprs.fef = 1 + + alignaddr %o1, %g0, %o5 ! align src + ldd [%o5], %d0 ! get first 8 byte block + andn %o2, 7, %o4 ! prepare src ptr for finishup code + cmp %o2, 32 + blt,pn %ncc, .dbmv8 + sub %o1, %o4, %o1 ! + cmp %o2, 4095 ! check for short memmoves + blt,pn %ncc, .dbmv32enter ! go to no prefetch code +.dbmv64: + ldd [%o5-8], %d2 ! load 8 bytes + ldd [%o5-16], %d4 ! load 8 bytes + sub %o5, 64, %o5 ! + ldd [%o5+40], %d6 ! load 8 bytes + sub %o0, 64, %o0 ! + ldd [%o5+32], %d8 ! load 8 bytes + sub %o2, 64, %o2 ! 64 less bytes to copy + ldd [%o5+24], %d18 ! load 8 bytes + cmp %o2, 64 ! do we have < 64 bytes remaining + ldd [%o5+16], %d28 ! load 8 bytes + ldd [%o5+8], %d30 ! load 8 bytes + prefetch [%o5 - (5 * BLOCK_SIZE)], 20 ! into the prefetch cache + faligndata %d2, %d0, %d10 ! extract 8 bytes out + ldd [%o5], %d0 ! load 8 bytes + std %d10, [%o0+56] ! store the current 8 bytes + faligndata %d4, %d2, %d12 ! extract 8 bytes out + std %d12, [%o0+48] ! store the current 8 bytes + faligndata %d6, %d4, %d14 ! extract 8 bytes out + std %d14, [%o0+40] ! store the current 8 bytes + faligndata %d8, %d6, %d16 ! extract 8 bytes out + std %d16, [%o0+32] ! store the current 8 bytes + faligndata %d18, %d8, %d20 ! extract 8 bytes out + std %d20, [%o0+24] ! store the current 8 bytes + faligndata %d28, %d18, %d22 ! extract 8 bytes out + std %d22, [%o0+16] ! store the current 8 bytes + faligndata %d30, %d28, %d24 ! extract 8 bytes out + std %d24, [%o0+8] ! store the current 8 bytes + faligndata %d0, %d30, %d26 ! extract 8 bytes out + bgeu,pt %ncc, .dbmv64 + std %d26, [%o0] ! store the current 8 bytes + + cmp %o2, 32 + blt,pn %ncc, .dbmvx + nop +.dbmv32: + ldd [%o5-8], %d2 ! load 8 bytes +.dbmv32enter: + ldd [%o5-16], %d4 ! load 8 bytes + sub %o5, 32, %o5 ! + ldd [%o5+8], %d6 ! load 8 bytes + sub %o0, 32, %o0 ! + faligndata %d2, %d0, %d10 ! extract 8 bytes out + ldd [%o5], %d0 ! load 8 bytes + sub %o2,32, %o2 ! 32 less bytes to copy + std %d10, [%o0+24] ! store the current 8 bytes + cmp %o2, 32 ! do we have < 32 bytes remaining + faligndata %d4, %d2, %d12 ! extract 8 bytes out + std %d12, [%o0+16] ! store the current 8 bytes + faligndata %d6, %d4, %d14 ! extract 8 bytes out + std %d14, [%o0+8] ! store the current 8 bytes + faligndata %d0, %d6, %d16 ! extract 8 bytes out + bgeu,pt %ncc, .dbmv32 + std %d16, [%o0] ! store the current 8 bytes +.dbmvx: + cmp %o2, 8 ! do we have < 8 bytes remaining + blt,pt %ncc, .dbmvfinish ! if yes, skip to finish up code + nop +.dbmv8: + ldd [%o5-8], %d2 + sub %o0, 8, %o0 ! since we are at the end + ! when we first enter the loop + sub %o2, 8, %o2 ! 8 less bytes to copy + sub %o5, 8, %o5 + cmp %o2, 8 ! do we have < 8 bytes remaining + faligndata %d2, %d0, %d8 ! extract 8 bytes out + std %d8, [%o0] ! store the current 8 bytes + bgeu,pt %ncc, .dbmv8 + fmovd %d2, %d0 +.dbmvfinish: + and %o3, 0x4, %o3 ! fprs.du = fprs.dl = 0 + tst %o2 + bz,pt %ncc, .dbexit + wr %o3, %g0, %fprs ! fprs = o3 restore fprs + +.dbremain: + cmp %o2, 4 + blt,pn %ncc, .dbbyte + nop + ldub [%o1-1], %o3 ! load last byte + stb %o3, [%o0-1] ! store last byte + sub %o1, 4, %o1 + ldub [%o1+2], %o3 ! load 2nd from last byte + stb %o3, [%o0-2] ! store 2nd from last byte + sub %o0, 4, %o0 + ldub [%o1+1], %o3 ! load 3rd from last byte + stb %o3, [%o0+1] ! store 3rd from last byte + subcc %o2, 4, %o2 + ldub [%o1], %o3 ! load 4th from last byte + stb %o3, [%o0] ! store 4th from last byte + bz,pt %ncc, .dbexit +.dbbyte: + dec %o1 ! 
decrement src address + ldub [%o1], %o3 ! read a byte + dec %o0 ! decrement dst address + deccc %o2 ! decrement count + bgu,pt %ncc, .dbbyte ! loop until done + stb %o3, [%o0] ! write byte +.dbexit: + retl + mov %g1, %o0 + SET_SIZE(memmove) + + + .align ICACHE_LINE_SIZE + ENTRY(memcpy) + ! adjust instruction alignment + nop ! Do not remove, these nops affect + nop ! icache alignment and performance +.forcpy: + cmp %o2, SMALL_MAX ! check for not small case + bgu,pn %ncc, .medium ! go to larger cases + mov %o0, %g1 ! save %o0 + cmp %o2, SHORTCOPY ! check for really short case + ble,pt %ncc, .smallleft ! + or %o0, %o1, %o3 ! prepare alignment check + andcc %o3, 0x3, %g0 ! test for alignment + bz,pt %ncc, .smallword ! branch to word aligned case + sub %o2, 3, %o2 ! adjust count to allow cc zero test +.smallnotalign4: + ldub [%o1], %o3 ! read byte + subcc %o2, 4, %o2 ! reduce count by 4 + stb %o3, [%o0] ! write byte + ldub [%o1+1], %o3 ! repeat for a total of 4 bytes + add %o1, 4, %o1 ! advance SRC by 4 + stb %o3, [%o0+1] + ldub [%o1-2], %o3 + add %o0, 4, %o0 ! advance DST by 4 + stb %o3, [%o0-2] + ldub [%o1-1], %o3 + bgu,pt %ncc, .smallnotalign4 ! loop til 3 or fewer bytes remain + stb %o3, [%o0-1] + add %o2, 3, %o2 ! restore count +.smallleft: + tst %o2 + bz,pt %ncc, .smallexit + nop +.smallleft3: ! 1, 2, or 3 bytes remain + ldub [%o1], %o3 ! load one byte + deccc %o2 ! reduce count for cc test + bz,pt %ncc, .smallexit + stb %o3, [%o0] ! store one byte + ldub [%o1+1], %o3 ! load second byte + deccc %o2 + bz,pt %ncc, .smallexit + stb %o3, [%o0+1] ! store second byte + ldub [%o1+2], %o3 ! load third byte + stb %o3, [%o0+2] ! store third byte + retl + mov %g1, %o0 ! restore %o0 + + .align 16 + nop ! affects loop icache alignment +.smallwords: + lduw [%o1], %o3 ! read word +.smallwordx: + subcc %o2, 8, %o2 ! update count + stw %o3, [%o0] ! write word + add %o1, 8, %o1 ! update SRC + lduw [%o1-4], %o3 ! read word + add %o0, 8, %o0 ! update DST + bgu,pt %ncc, .smallwords ! loop until done + stw %o3, [%o0-4] ! write word + addcc %o2, 7, %o2 ! restore count + bz,pt %ncc, .smallexit ! check for completion + nop + cmp %o2, 4 ! check for 4 or more bytes left + blt .smallleft3 ! if not, go to finish up + nop + lduw [%o1], %o3 + add %o1, 4, %o1 + subcc %o2, 4, %o2 + stw %o3, [%o0] + add %o0, 4, %o0 + bnz,pt %ncc, .smallleft3 + nop + retl + mov %g1, %o0 ! restore %o0 + +.smallword: + subcc %o2, 4, %o2 ! update count + bgu,pt %ncc, .smallwordx + lduw [%o1], %o3 ! read word + addcc %o2, 3, %o2 ! restore count + bz,pt %ncc, .smallexit + stw %o3, [%o0] ! write word + deccc %o2 ! reduce count for cc test + ldub [%o1+4], %o3 ! load one byte + bz,pt %ncc, .smallexit + stb %o3, [%o0+4] ! store one byte + ldub [%o1+5], %o3 ! load second byte + deccc %o2 + bz,pt %ncc, .smallexit + stb %o3, [%o0+5] ! store second byte + ldub [%o1+6], %o3 ! load third byte + stb %o3, [%o0+6] ! store third byte +.smallexit: + retl + mov %g1, %o0 ! restore %o0 + .align 16 +.medium: + neg %o0, %o5 + neg %o1, %o3 + andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned + and %o3, 7, %o3 ! bytes till SRC 8 byte aligned + + bz %ncc, 2f + sub %o5, %o3, %o3 ! -(bytes till SRC aligned after DST aligned) + ! o3={-7, -6, ... 7} o3>0 => SRC overaligned + + sub %o2, %o5, %o2 ! update count + +1: + ldub [%o1], %o4 + deccc %o5 + inc %o1 + stb %o4, [%o0] + bgu,pt %ncc, 1b + inc %o0 + + ! Now DST is 8-byte aligned. o0, o1, o2 are current. + +2: + andcc %o1, 0x3, %g0 ! test alignment + bnz,pt %ncc, .mediumsetup ! branch to skip aligned cases + ! 
if src, dst not aligned + prefetch [%o1 + (1 * BLOCK_SIZE)], 20 + +/* + * Handle all cases where src and dest are aligned on word + * or long word boundaries. Use unrolled loops for better + * performance. This option wins over standard large data + * move when source and destination is in cache for medium + * to short data moves. + */ + andcc %o1, 0x7, %g0 ! test word alignment + bz,pt %ncc, .medlword ! branch to long word aligned case + prefetch [%o1 + (2 * BLOCK_SIZE)], 20 + cmp %o2, MED_WMAX ! limit to store buffer size + bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop + nop + subcc %o2, 15, %o2 ! adjust length to allow cc test + ! for end of loop + ble,pt %ncc, .medw15 ! skip big loop if less than 16 + prefetch [%o1 + (3 * BLOCK_SIZE)], 20 +/* + * no need to put prefetch in loop as prefetches have + * already been issued for maximum loop size + */ +.medw16: + ld [%o1], %o4 ! load + subcc %o2, 16, %o2 ! decrement length count + stw %o4, [%o0] ! and store + ld [%o1+4], %o3 ! a block of 16 bytes + add %o1, 16, %o1 ! increase src ptr by 16 + stw %o3, [%o0+4] + ld [%o1-8], %o4 + add %o0, 16, %o0 ! increase dst ptr by 16 + stw %o4, [%o0-8] + ld [%o1-4], %o3 + bgu,pt %ncc, .medw16 ! repeat if at least 16 bytes left + stw %o3, [%o0-4] +.medw15: + addcc %o2, 15, %o2 ! restore count + bz,pt %ncc, .medwexit ! exit if finished + nop + cmp %o2, 8 + blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left + nop ! + ld [%o1], %o4 ! load 4 bytes + subcc %o2, 8, %o2 ! decrease count by 8 + stw %o4, [%o0] ! and store 4 bytes + add %o1, 8, %o1 ! increase src ptr by 8 + ld [%o1-4], %o3 ! load 4 bytes + add %o0, 8, %o0 ! increase dst ptr by 8 + stw %o3, [%o0-4] ! and store 4 bytes + bz %ncc, .medwexit ! exit if finished + nop +.medw7: ! count is ge 1, less than 8 + cmp %o2, 3 ! check for 4 bytes left + ble,pt %ncc, .medw3 ! skip if 3 or fewer bytes left + nop ! + ld [%o1], %o4 ! load 4 bytes + sub %o2, 4, %o2 ! decrease count by 4 + add %o1, 4, %o1 ! increase src ptr by 4 + stw %o4, [%o0] ! and store 4 bytes + add %o0, 4, %o0 ! increase dst ptr by 4 + tst %o2 ! check for zero bytes left + bz %ncc, .medwexit ! exit if finished + nop +.medw3: ! count is known to be 1, 2, or 3 + deccc %o2 ! reduce count by one + ldub [%o1], %o3 ! load one byte + bz,pt %ncc, .medwexit ! exit if last byte + stb %o3, [%o0] ! store one byte + ldub [%o1+1], %o3 ! load second byte + deccc %o2 ! reduce count by one + bz,pt %ncc, .medwexit ! exit if last byte + stb %o3, [%o0+1] ! store second byte + ldub [%o1+2], %o3 ! load third byte + stb %o3, [%o0+2] ! store third byte +.medwexit: + retl + mov %g1, %o0 ! restore %o0 + +/* + * Special case for handling when src and dest are both long word aligned + * and total data to move is between SMALL_MAX and MED_MAX bytes + */ + + .align 16 + nop +.medlword: ! long word aligned + ! length > SMALL_MAX + cmp %o2, MED_MAX ! limit to store buffer size + bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop + nop + subcc %o2, 31, %o2 ! adjust length to allow cc test + ! for end of loop + ble,pt %ncc, .medl31 ! skip big loop if less than 32 + prefetch [%o1 + (3 * BLOCK_SIZE)], 20 ! into the l2 cache +/* + * no need to put prefetch in loop as prefetches have + * already been issued for maximum loop size + */ +.medl32: + ldx [%o1], %o4 ! load + subcc %o2, 32, %o2 ! decrement length count + stx %o4, [%o0] ! and store + ldx [%o1+8], %o3 ! a block of 32 bytes + add %o1, 32, %o1 ! increase src ptr by 32 + stx %o3, [%o0+8] + ldx [%o1-16], %o4 + add %o0, 32, %o0 ! 
increase dst ptr by 32 + stx %o4, [%o0-16] + ldx [%o1-8], %o3 + bgu,pt %ncc, .medl32 ! repeat if at least 32 bytes left + stx %o3, [%o0-8] +.medl31: + addcc %o2, 16, %o2 ! adjust remaining count + ble,pt %ncc, .medl15 ! skip if 15 or fewer bytes left + nop ! + ldx [%o1], %o4 ! load and store 16 bytes + add %o1, 16, %o1 ! increase src ptr by 16 + stx %o4, [%o0] ! + sub %o2, 16, %o2 ! decrease count by 16 + ldx [%o1-8], %o3 ! + add %o0, 16, %o0 ! increase dst ptr by 16 + stx %o3, [%o0-8] +.medl15: + addcc %o2, 15, %o2 ! restore count + bz,pt %ncc, .medwexit ! exit if finished + nop + cmp %o2, 8 + blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left + nop + ldx [%o1], %o4 ! load 8 bytes + add %o1, 8, %o1 ! increase src ptr by 8 + stx %o4, [%o0] ! and store 8 bytes + subcc %o2, 8, %o2 ! decrease count by 8 + bz %ncc, .medwexit ! exit if finished + add %o0, 8, %o0 ! increase dst ptr by 8 + ba .medw7 + nop + + .align 16 + nop + nop + nop +.mediumsetup: + prefetch [%o1 + (2 * BLOCK_SIZE)], 21 +.mediumrejoin: + rd %fprs, %o4 ! check for unused FPU + + add %o1, 8, %o1 ! prepare to round SRC upward + + sethi %hi(0x1234567f), %o5 ! For GSR.MASK + or %o5, 0x67f, %o5 + + andcc %o4, FPRS_FEF, %o4 ! test FEF, fprs.du = fprs.dl = 0 + bz,a %ncc, 3f + wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 +3: + cmp %o2, MEDIUM_MAX + bmask %o5, %g0, %g0 + + ! Compute o5 (number of bytes that need copying using the main loop). + ! First, compute for the medium case. + ! Then, if large case, o5 is replaced by count for block alignment. + ! Be careful not to read past end of SRC + ! Currently, o2 is the actual count remaining + ! o3 is how much sooner we'll cross the alignment boundary + ! in SRC compared to in DST + ! + ! Examples: Let # denote bytes that should not be accessed + ! Let x denote a byte already copied to align DST + ! Let . and - denote bytes not yet copied + ! Let | denote double alignment boundaries + ! + ! DST: ######xx|........|--------|..###### o2 = 18 + ! o0 + ! + ! o3 = -3: SRC: ###xx...|.....---|-----..#|######## o5 = 8 + ! o1 + ! + ! o3 = 0: SRC: ######xx|........|--------|..###### o5 = 16-8 = 8 + ! o1 + ! + ! o3 = +1: SRC: #######x|x.......|.-------|-..##### o5 = 16-8 = 8 + ! o1 + + or %g0, -8, %o5 + alignaddr %o1, %g0, %o1 ! set GSR.ALIGN and align o1 + + movrlz %o3, %g0, %o5 ! subtract 8 from o2+o3 only if o3>=0 + add %o5, %o2, %o5 + add %o5, %o3, %o5 + + bleu %ncc, 4f + andn %o5, 7, %o5 ! 8 byte aligned count + neg %o0, %o5 ! 'large' case + and %o5, BLOCK_SIZE-1, %o5 ! bytes till DST block aligned +4: + brgez,a %o3, .beginmedloop + ldd [%o1-8], %d0 + + add %o1, %o3, %o1 ! back up o1 +5: + ldda [%o1]ASI_FL8_P, %d2 + inc %o1 + andcc %o1, 7, %g0 + bnz %ncc, 5b + bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2 + +.beginmedloop: + tst %o5 + bz %ncc, .endmedloop + sub %o2, %o5, %o2 ! update count for later + + ! Main loop to write out doubles. Note: o5 & 7 == 0 + + ldd [%o1], %d2 + subcc %o5, 8, %o5 ! update local count + bz,pn %ncc, 1f + add %o1, 8, %o1 ! update SRC + +.medloop: + faligndata %d0, %d2, %d4 + ldd [%o1], %d0 + subcc %o5, 8, %o5 ! update local count + add %o1, 16, %o1 ! update SRC + std %d4, [%o0] + bz,pn %ncc, 2f + faligndata %d2, %d0, %d6 + ldd [%o1 - 8], %d2 + subcc %o5, 8, %o5 ! update local count + std %d6, [%o0 + 8] + bnz,pt %ncc, .medloop + add %o0, 16, %o0 ! update DST + +1: + faligndata %d0, %d2, %d4 + fmovd %d2, %d0 + std %d4, [%o0] + ba .endmedloop + add %o0, 8, %o0 + +2: + std %d6, [%o0 + 8] + sub %o1, 8, %o1 + add %o0, 16, %o0 + + +.endmedloop: + ! 
Currently, o1 is pointing to the next double-aligned byte in SRC + ! The 8 bytes starting at [o1-8] are available in d0 + ! At least one, and possibly all, of these need to be written. + + cmp %o2, BLOCK_SIZE + bgu %ncc, .large ! otherwise, less than 16 bytes left + +#if 0 + + /* This code will use partial stores. */ + + mov %g0, %o5 + and %o3, 7, %o3 ! Number of bytes needed to completely + ! fill %d0 with good (unwritten) data. + + subcc %o2, 8, %o2 ! update count (maybe too much) + movl %ncc, %o2, %o5 + addcc %o3, %o5, %o5 ! extra bytes we can stuff into %d0 + sub %o3, %o5, %o3 ! update o3 (# bad bytes in %d0) + + bz %ncc, 2f + alignaddr %o3, %g0, %g0 ! set GSR.ALIGN + +1: + deccc %o5 + ldda [%o1]ASI_FL8_P, %d2 + inc %o1 + bgu %ncc, 1b + bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2 + +2: + not %o3 + faligndata %d0, %d0, %d0 ! shift bytes to the left + and %o3, 7, %o3 ! last byte to be stored in [%o0+%o3] + edge8n %g0, %o3, %o5 + stda %d0, [%o0]%o5, ASI_PST8_P + brlez %o2, .mediumexit + add %o0, %o3, %o0 ! update DST to last stored byte +3: + inc %o0 + deccc %o2 + ldub [%o1], %o3 + stb %o3, [%o0] + bgu %ncc, 3b + inc %o1 + +#else + + andcc %o3, 7, %o5 ! Number of bytes needed to completely + ! fill %d0 with good (unwritten) data. + bz %ncc, 2f + sub %o5, 8, %o3 ! -(number of good bytes in %d0) + cmp %o2, 8 + bl,a %ncc, 3f ! Not enough bytes to fill %d0 + add %o1, %o3, %o1 ! Back up %o1 + +1: + deccc %o5 + ldda [%o1]ASI_FL8_P, %d2 + inc %o1 + bgu %ncc, 1b + bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2 + +2: + subcc %o2, 8, %o2 + std %d0, [%o0] + bz %ncc, .mediumexit + add %o0, 8, %o0 +3: + ldub [%o1], %o3 + deccc %o2 + inc %o1 + stb %o3, [%o0] + bgu %ncc, 3b + inc %o0 +#endif + +.mediumexit: + wr %o4, %g0, %fprs ! fprs = o4 restore fprs + retl + mov %g1, %o0 + + + .align ICACHE_LINE_SIZE +.large: + ! The following test for BSTORE_SIZE is used to decide whether + ! to store data with a block store or with individual stores. + ! The block store wins when the amount of data is so large + ! that it causes other application data to be moved out + ! of the L1 or L2 cache. + ! On a Panther, block store can lose more often because block + ! store forces the stored data to be removed from the L3 cache. + ! + sethi %hi(BSTORE_SIZE),%o5 + or %o5,%lo(BSTORE_SIZE),%o5 + cmp %o2, %o5 + bgu %ncc, .xlarge + + ! %o0 I/O DST is 64-byte aligned + ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN) + ! %d0 I/O already loaded with SRC data from [%o1-8] + ! %o2 I/O count (number of bytes that need to be written) + ! %o3 I Not written. If zero, then SRC is double aligned. + ! %o4 I Not written. Holds fprs. + ! %o5 O The number of doubles that remain to be written. + + ! Load the rest of the current block + ! Recall that %o1 is further into SRC than %o0 is into DST + + prefetch [%o0 + (0 * BLOCK_SIZE)], 22 + prefetch [%o0 + (1 * BLOCK_SIZE)], 22 + prefetch [%o0 + (2 * BLOCK_SIZE)], 22 + ldd [%o1], %f2 + prefetch [%o1 + (3 * BLOCK_SIZE)], 21 + ldd [%o1 + 0x8], %f4 + faligndata %f0, %f2, %f32 + ldd [%o1 + 0x10], %f6 + faligndata %f2, %f4, %f34 + ldd [%o1 + 0x18], %f8 + faligndata %f4, %f6, %f36 + ldd [%o1 + 0x20], %f10 + or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8 + prefetch [%o1 + (4 * BLOCK_SIZE)], 21 + faligndata %f6, %f8, %f38 + ldd [%o1 + 0x28], %f12 + movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later) + faligndata %f8, %f10, %f40 + ldd [%o1 + 0x30], %f14 + faligndata %f10, %f12, %f42 + ldd [%o1 + 0x38], %f0 + sub %o2, BLOCK_SIZE, %o2 ! 
update count + prefetch [%o1 + (5 * BLOCK_SIZE)], 21 + add %o1, BLOCK_SIZE, %o1 ! update SRC + + ! Main loop. Write previous block. Load rest of current block. + ! Some bytes will be loaded that won't yet be written. +1: + ldd [%o1], %f2 + faligndata %f12, %f14, %f44 + ldd [%o1 + 0x8], %f4 + faligndata %f14, %f0, %f46 + std %f32, [%o0] + std %f34, [%o0+8] + std %f36, [%o0+16] + std %f38, [%o0+24] + std %f40, [%o0+32] + std %f42, [%o0+40] + std %f44, [%o0+48] + std %f46, [%o0+56] + sub %o2, BLOCK_SIZE, %o2 ! update count + prefetch [%o0 + (6 * BLOCK_SIZE)], 22 + prefetch [%o0 + (3 * BLOCK_SIZE)], 22 + add %o0, BLOCK_SIZE, %o0 ! update DST + ldd [%o1 + 0x10], %f6 + faligndata %f0, %f2, %f32 + ldd [%o1 + 0x18], %f8 + faligndata %f2, %f4, %f34 + ldd [%o1 + 0x20], %f10 + faligndata %f4, %f6, %f36 + ldd [%o1 + 0x28], %f12 + faligndata %f6, %f8, %f38 + ldd [%o1 + 0x30], %f14 + faligndata %f8, %f10, %f40 + ldd [%o1 + 0x38], %f0 + faligndata %f10, %f12, %f42 + cmp %o2, BLOCK_SIZE + 8 + prefetch [%o1 + (5 * BLOCK_SIZE)], 21 + bgu,pt %ncc, 1b + add %o1, BLOCK_SIZE, %o1 ! update SRC + faligndata %f12, %f14, %f44 + faligndata %f14, %f0, %f46 + stda %f32, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache + cmp %o2, BLOCK_SIZE + bne %ncc, 2f ! exactly 1 block remaining? + add %o0, BLOCK_SIZE, %o0 ! update DST + brz,a %o3, 3f ! is SRC double aligned? + ldd [%o1], %f2 + +2: + add %o5, %o2, %o5 ! %o5 was already set to 0 or -8 + add %o5, %o3, %o5 + + membar #StoreLoad|#StoreStore + + ba .beginmedloop + andn %o5, 7, %o5 ! 8 byte aligned count + + + ! This is when there is exactly 1 block remaining and SRC is aligned +3: + ldd [%o1 + 0x8], %f4 + ldd [%o1 + 0x10], %f6 + fsrc1 %f0, %f32 + ldd [%o1 + 0x18], %f8 + fsrc1 %f2, %f34 + ldd [%o1 + 0x20], %f10 + fsrc1 %f4, %f36 + ldd [%o1 + 0x28], %f12 + fsrc1 %f6, %f38 + ldd [%o1 + 0x30], %f14 + fsrc1 %f8, %f40 + fsrc1 %f10, %f42 + fsrc1 %f12, %f44 + fsrc1 %f14, %f46 + stda %f32, [%o0]ASI_BLK_P + membar #StoreLoad|#StoreStore + wr %o4, 0, %fprs + retl + mov %g1, %o0 + + + .align 16 + ! two nops here causes loop starting at 1f below to be + ! on a cache line boundary, improving performance + nop + nop +.xlarge: + ! %o0 I/O DST is 64-byte aligned + ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN) + ! %d0 I/O already loaded with SRC data from [%o1-8] + ! %o2 I/O count (number of bytes that need to be written) + ! %o3 I Not written. If zero, then SRC is double aligned. + ! %o4 I Not written. Holds fprs. + ! %o5 O The number of doubles that remain to be written. + + ! Load the rest of the current block + ! Recall that %o1 is further into SRC than %o0 is into DST + + ! prefetch [%o1 + (3 * BLOCK_SIZE)], 21 + ! executed in delay slot for branch to .xlarge + prefetch [%o1 + (4 * BLOCK_SIZE)], 21 + prefetch [%o1 + (5 * BLOCK_SIZE)], 21 + ldd [%o1], %f2 + prefetch [%o1 + (6 * BLOCK_SIZE)], 21 + ldd [%o1 + 0x8], %f4 + faligndata %f0, %f2, %f32 + ldd [%o1 + 0x10], %f6 + faligndata %f2, %f4, %f34 + ldd [%o1 + 0x18], %f8 + faligndata %f4, %f6, %f36 + ldd [%o1 + 0x20], %f10 + or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8 + faligndata %f6, %f8, %f38 + ldd [%o1 + 0x28], %f12 + movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later) + faligndata %f8, %f10, %f40 + ldd [%o1 + 0x30], %f14 + faligndata %f10, %f12, %f42 + ldd [%o1 + 0x38], %f0 + sub %o2, BLOCK_SIZE, %o2 ! update count + prefetch [%o1 + (7 * BLOCK_SIZE)], 21 + add %o1, BLOCK_SIZE, %o1 ! update SRC + + ! This point is 32-byte aligned since 24 instructions appear since + ! the previous alignment directive. + + + ! Main loop. 
Write previous block. Load rest of current block. + ! Some bytes will be loaded that won't yet be written. +1: + ldd [%o1], %f2 + faligndata %f12, %f14, %f44 + ldd [%o1 + 0x8], %f4 + faligndata %f14, %f0, %f46 + stda %f32, [%o0]ASI_BLK_P + sub %o2, BLOCK_SIZE, %o2 ! update count + ldd [%o1 + 0x10], %f6 + faligndata %f0, %f2, %f32 + ldd [%o1 + 0x18], %f8 + faligndata %f2, %f4, %f34 + ldd [%o1 + 0x20], %f10 + faligndata %f4, %f6, %f36 + ldd [%o1 + 0x28], %f12 + faligndata %f6, %f8, %f38 + ldd [%o1 + 0x30], %f14 + faligndata %f8, %f10, %f40 + ldd [%o1 + 0x38], %f0 + faligndata %f10, %f12, %f42 + ! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K + prefetch [%o1 + (8 * BLOCK_SIZE) + 8], 21 + add %o0, BLOCK_SIZE, %o0 ! update DST + cmp %o2, BLOCK_SIZE + 8 + ! second prefetch important to correct for occasional dropped + ! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K + ! strong prefetch prevents drops on Panther, but Jaguar and earlier + ! US-III models treat strong prefetches as weak prefetches + ! to avoid regressions on customer hardware, we retain the prefetch + prefetch [%o1 + (5 * BLOCK_SIZE)], 21 + bgu,pt %ncc, 1b + add %o1, BLOCK_SIZE, %o1 ! update SRC + + faligndata %f12, %f14, %f44 + faligndata %f14, %f0, %f46 + stda %f32, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache + cmp %o2, BLOCK_SIZE + bne %ncc, 2f ! exactly 1 block remaining? + add %o0, BLOCK_SIZE, %o0 ! update DST + brz,a %o3, 3f ! is SRC double aligned? + ldd [%o1], %f2 + +2: + add %o5, %o2, %o5 ! %o5 was already set to 0 or -8 + add %o5, %o3, %o5 + + membar #StoreLoad|#StoreStore + + ba .beginmedloop + andn %o5, 7, %o5 ! 8 byte aligned count + + + ! This is when there is exactly 1 block remaining and SRC is aligned +3: + ldd [%o1 + 0x8], %f4 + ldd [%o1 + 0x10], %f6 + fsrc1 %f0, %f32 + ldd [%o1 + 0x18], %f8 + fsrc1 %f2, %f34 + ldd [%o1 + 0x20], %f10 + fsrc1 %f4, %f36 + ldd [%o1 + 0x28], %f12 + fsrc1 %f6, %f38 + ldd [%o1 + 0x30], %f14 + fsrc1 %f8, %f40 + fsrc1 %f10, %f42 + fsrc1 %f12, %f44 + fsrc1 %f14, %f46 + stda %f32, [%o0]ASI_BLK_P + membar #StoreLoad|#StoreStore + wr %o4, 0, %fprs + retl + mov %g1, %o0 + + SET_SIZE(memcpy)
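
As a reading aid (not part of the patch): the entry sequence of memmove above (cmp %o1, %o0 ... sub %o0, %o1, %o4 ... bleu %ncc, .forcpy) implements the usual overlap test, copying forward unless the destination starts inside the source region. A minimal C sketch under that reading; the name my_memmove is illustrative only:

	#include <stddef.h>

	void *
	my_memmove(void *s1, const void *s2, size_t n)
	{
		char *dst = s1;
		const char *src = s2;

		if (src >= dst || (size_t)(dst - src) >= n) {
			/* no harmful overlap: forward copy (.forcpy) */
			for (size_t i = 0; i < n; i++)
				dst[i] = src[i];
		} else {
			/* dst lands inside src: copy backward (.ovbc) */
			while (n-- != 0)
				dst[n] = src[n];
		}
		return (s1);
	}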
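The misaligned paths (.dbbck, .mediumsetup, .large, .xlarge) all lean on alignaddr/faligndata: SRC is rounded down to an 8-byte boundary, and each aligned 8-byte store is assembled from two adjacent aligned loads. A rough C model of one faligndata step, assuming big-endian byte order as on SPARC and a nonzero misalignment (when src and dst are mutually aligned the code takes the integer paths instead):

	#include <stdint.h>

	/*
	 * Hypothetical model, not the hardware definition: prev and next
	 * are two consecutive aligned 8-byte loads (like %d0 and %d2);
	 * shift is (src & 7), the value alignaddr latches into GSR.ALIGN.
	 * Valid for 0 < shift < 8; shift == 0 would just return prev.
	 */
	static uint64_t
	falign_model(uint64_t prev, uint64_t next, unsigned shift)
	{
		/* big-endian: low bytes of prev, then high bytes of next */
		return ((prev << (8 * shift)) | (next >> (8 * (8 - shift))));
	}

Note the asm is careful never to load a double that lies wholly past the end of SRC (see the "Be careful not to read past end of SRC" comment); a C model this simple would over-read at the tail.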
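Finally, the length constants defined at the top of the file (SHORTCOPY, SMALL_MAX, MED_WMAX/MED_MAX, BSTORE_SIZE) drive a strategy dispatch that is roughly as follows. This is a simplification of the real branch structure, and the enum and function names are illustrative only:

	#include <stddef.h>

	enum strategy {
		BYTE_COPY,	/* .smallleft: 0-3 bytes */
		WORD_COPY,	/* .smallword paths: up to SMALL_MAX (39) */
		INT_UNROLLED,	/* .medw*/ /* and .medl*: mutually aligned */
		FP_ALIGN_COPY,	/* .medloop/.large: faligndata + std */
		FP_BLOCK_STORE	/* .xlarge: stda ASI_BLK_P, bypasses cache */
	};

	static enum strategy
	pick_strategy(size_t n, int mutually_aligned)
	{
		if (n <= 3)				/* SHORTCOPY */
			return (BYTE_COPY);
		if (n <= 39)				/* SMALL_MAX */
			return (WORD_COPY);
		if (mutually_aligned && n <= 256)	/* MED_WMAX/MED_MAX */
			return (INT_UNROLLED);
		if (n <= 256)				/* BSTORE_SIZE default */
			return (FP_ALIGN_COPY);
		return (FP_BLOCK_STORE);
	}

Per the comment at .large, the BSTORE_SIZE cutover exists because a block store only wins once the data is large enough to evict other useful lines from the L1/L2 caches, and can lose on Panther, where it forces the stored data out of L3.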