/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

	.ident "%Z%%M% %I% %E% SMI"

	.file "memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this does not work for overlapped copies, bcopy() does
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return ( s );
 *	}
 *
 * Flow :
 *
 * if (count <= 17) {
 *	Do the byte copy
 *	Return destination address
 * }
 * if (count <= 128, or 0 <= (src - dst) < 64) {
 *	Is source aligned on word boundary
 *		If no then align source on word boundary then goto .ald
 *		If yes goto .ald
 *	.ald:
 *		Is destination aligned on word boundary
 *		Depending on destination offset (last 2 bits of destination)
 *		copy data by shifting and merging.
 *		Copy residue bytes as byte copy
 *		Return destination address
 * } else {
 *	Align destination on block boundary
 *	Depending on the source offset (last 4 bits of source address) align
 *	the data and store to destination. Both the load and store are done
 *	using ASI_BLK_INIT_ST_QUAD_LDD_P.
 *	For remaining count copy as much data in 8-byte chunk from source to
 *	destination.
 *	Followed by trailing copy using byte copy.
 *	Return saved destination address
 * }
 *
 */

/*
 * NOTE(review): the operands of the #include directives below (four here and
 * one under NIAGARA2_IMPL) are missing in this copy of the file, most likely
 * stripped as "<...>" markup. They are left untouched rather than guessed and
 * must be restored from the upstream source before this file will preprocess.
 * The code below relies on ENTRY, SET_SIZE, ANSI_PRAGMA_WEAK, SA, MINFRAME,
 * ASI_BLK_INIT_ST_QUAD_LDD_P, ASI_BLK_P and ASI_PNF being defined by them.
 */
#include
#include
#include
#include

#ifdef NIAGARA2_IMPL
#include

/*
 * ALIGN_OFF_<lo>_<hi>: eight faligndata operations that combine adjacent
 * pairs of double registers, starting at the register that corresponds to
 * the 8-byte slot of the source offset within its 64-byte block, and leave
 * one realigned 64-byte block in %d48-%d62.
 */
#define	ALIGN_OFF_1_7			\
	faligndata %d0, %d2, %d48	;\
	faligndata %d2, %d4, %d50	;\
	faligndata %d4, %d6, %d52	;\
	faligndata %d6, %d8, %d54	;\
	faligndata %d8, %d10, %d56	;\
	faligndata %d10, %d12, %d58	;\
	faligndata %d12, %d14, %d60	;\
	faligndata %d14, %d16, %d62

#define	ALIGN_OFF_8_15			\
	faligndata %d2, %d4, %d48	;\
	faligndata %d4, %d6, %d50	;\
	faligndata %d6, %d8, %d52	;\
	faligndata %d8, %d10, %d54	;\
	faligndata %d10, %d12, %d56	;\
	faligndata %d12, %d14, %d58	;\
	faligndata %d14, %d16, %d60	;\
	faligndata %d16, %d18, %d62

#define	ALIGN_OFF_16_23			\
	faligndata %d4, %d6, %d48	;\
	faligndata %d6, %d8, %d50	;\
	faligndata %d8, %d10, %d52	;\
	faligndata %d10, %d12, %d54	;\
	faligndata %d12, %d14, %d56	;\
	faligndata %d14, %d16, %d58	;\
	faligndata %d16, %d18, %d60	;\
	faligndata %d18, %d20, %d62

#define	ALIGN_OFF_24_31			\
	faligndata %d6, %d8, %d48	;\
	faligndata %d8, %d10, %d50	;\
	faligndata %d10, %d12, %d52	;\
	faligndata %d12, %d14, %d54	;\
	faligndata %d14, %d16, %d56	;\
	faligndata %d16, %d18, %d58	;\
	faligndata %d18, %d20, %d60	;\
	faligndata %d20, %d22, %d62

#define	ALIGN_OFF_32_39			\
	faligndata %d8, %d10, %d48	;\
	faligndata %d10, %d12, %d50	;\
	faligndata %d12, %d14, %d52	;\
	faligndata %d14, %d16, %d54	;\
	faligndata %d16, %d18, %d56	;\
	faligndata %d18, %d20, %d58	;\
	faligndata %d20, %d22, %d60	;\
	faligndata %d22, %d24, %d62

#define	ALIGN_OFF_40_47			\
	faligndata %d10, %d12, %d48	;\
	faligndata %d12, %d14, %d50	;\
	faligndata %d14, %d16, %d52	;\
	faligndata %d16, %d18, %d54	;\
	faligndata %d18, %d20, %d56	;\
	faligndata %d20, %d22, %d58	;\
	faligndata %d22, %d24, %d60	;\
	faligndata %d24, %d26, %d62

#define	ALIGN_OFF_48_55			\
	faligndata %d12, %d14, %d48	;\
	faligndata %d14, %d16, %d50	;\
	faligndata %d16, %d18, %d52	;\
	faligndata %d18, %d20, %d54	;\
	faligndata %d20, %d22, %d56	;\
	faligndata %d22, %d24, %d58	;\
	faligndata %d24, %d26, %d60	;\
	faligndata %d26, %d28, %d62

#define	ALIGN_OFF_56_63			\
	faligndata %d14, %d16, %d48	;\
	faligndata %d16, %d18, %d50	;\
	faligndata %d18, %d20, %d52	;\
	faligndata %d20, %d22, %d54	;\
	faligndata %d22, %d24, %d56	;\
	faligndata %d24, %d26, %d58	;\
	faligndata %d26, %d28, %d60	;\
	faligndata %d28, %d30, %d62

#else	/* NIAGARA2_IMPL */
/*
 * This define is to align data for the unaligned source cases.
 * The data1, data2 and data3 is merged into data1 and data2.
 * The data3 is preserved for next merge.
 * tmp is clobbered.
 */
#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1				;\
	srlx	data2, rshift, tmp				;\
	or	data1, tmp, data1				;\
	sllx	data2, lshift, data2				;\
	srlx	data3, rshift, tmp				;\
	or	data2, tmp, data2
/*
 * Align the data. Merge the data1 and data2 into data1.
 * data2 is preserved, tmp is clobbered.
 */
#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1				;\
	srlx	data2, rshift, tmp				;\
	or	data1, tmp, data1
#endif	/* NIAGARA2_IMPL */

/*
 * Align the data in case of backward copy.
 * data1 = (data1 >> rshift) | (data2 << lshift); data2 is preserved and
 * tmp is clobbered.
 */
#define	ALIGN_DATA_BC(data1, data2, rshift, lshift, tmp)	\
	srlx	data1, rshift, data1				;\
	sllx	data2, lshift, tmp				;\
	or	data1, tmp, data1

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

#include "synonyms.h"

/*
 * memmove(dst, src, len)
 *
 * In:	%o0 = destination, %o1 = source, %o2 = byte count
 * Out:	%o0 = original destination
 *
 * If src >= dst, or if len <= (dst - src), a front-to-back copy is safe and
 * control transfers to forcpy (the memcpy body below). Otherwise the copy is
 * done back-to-front, starting from the end of both buffers:
 *   - len <= 0x20: plain byte loop (leaf code, no register window).
 *   - len >  0x20: byte-copy until the (end of) destination is 8-byte
 *     aligned, take a register window, copy 8 bytes at a time (shifting and
 *     merging with ALIGN_DATA_BC when the source is not 8-byte aligned),
 *     then byte-copy the residue.
 * The original destination is kept in %o5 (%i5 after the save).
 */
	ENTRY(memmove)
	cmp	%o1, %o0	! if from address is >= to use forward copy
	bgeu	%ncc, forcpy	! else use backward if ...
	sub	%o0, %o1, %o4	! get difference of two addresses
	cmp	%o2, %o4	! compare size and difference of addresses
	bleu	%ncc, forcpy	! if size is bigger, do overlapped copy
	nop

	!
	! an overlapped copy that must be done "backwards"
	!
.ovbc:
	mov	%o0, %o5		! save des address for return val
	add	%o1, %o2, %o1		! get to end of source space
	add	%o0, %o2, %o0		! get to end of destination space

.chksize:
	cmp	%o2, 0x20
	bgu,pn	%ncc, .dbalign		! more than 32 bytes: use 8-byte copy
	nop

.bytecp:
	tst	%o2
	bleu,a,pn %ncc, exitovbc	! nothing to copy
	nop

1:
	dec	%o0			! decrement to address
	dec	%o1			! decrement from address
	ldub	[%o1], %o4
	deccc	%o2
	bgu,pt	%ncc, 1b
	stb	%o4, [%o0]		! (delay slot) store the byte
exitovbc:
	retl
	mov	%o5, %o0		! (delay slot) return original dst

.dbalign:
	andcc	%o0, 7, %o3
	bz	%ncc, .dbbck
	nop
	! %o3 has bytes till dst 8 bytes aligned
	sub	%o2, %o3, %o2		! update o2 with new count

2:	dec	%o1
	dec	%o0
	ldub	[%o1], %o4
	deccc	%o3
	bgu,pt	%ncc, 2b
	stb	%o4, [%o0]

	! Now Destination is 8 byte aligned
.dbbck:
	save	%sp, -SA(MINFRAME), %sp

	andn	%i2, 0x7, %i3		! %i3 count is multiple of 8 bytes size
	sub	%i2, %i3, %i2		! Residue bytes in %i2

	andcc	%i1, 7, %g1		! is src aligned on 8 bytes
					! %g1 has src offset
	bz	%ncc, .dbcopybc
	nop

	sll	%g1, 3, %o1		! left shift
	mov	0x40, %g5
	sub	%g5, %o1, %g5		! right shift = (64 - left shift)

.cpy_dbwdbc:
	sub	%i1, %g1, %i1		! align the src at 8 bytes.
	ldx	[%i1], %o2
2:
	sub	%i0, 0x8, %i0
	ldx	[%i1-0x8], %o4
	! we are at the end
	ALIGN_DATA_BC(%o2, %o4, %g5, %o1, %o3)
	stx	%o2, [%i0]
	mov	%o4, %o2		! carry the lower word into next merge
	subcc	%i3, 0x8, %i3
	bgu,pt	%ncc, 2b
	sub	%i1, 0x8, %i1
	ba	.bytebc
	add	%i1, %g1, %i1		! (delay slot) undo the src alignment

.dbcopybc:
	sub	%i1, 8, %i1
	sub	%i0, 8, %i0		! we are at the end
	ldx	[%i1], %o2
	stx	%o2, [%i0]
	subcc	%i3, 0x8, %i3
	bgu,pt	%ncc, .dbcopybc
	nop

.bytebc:
	tst	%i2
	bleu,a,pn %ncc, exitbc		! no residue bytes
	nop

1:
	dec	%i0			! decrement to address
	dec	%i1			! decrement from address
	ldub	[%i1], %i4
	deccc	%i2
	bgu,pt	%ncc, 1b
	stb	%i4, [%i0]
exitbc:
	ret
	restore	%i5, %g0, %o0		! return original dst saved in %o5
	SET_SIZE(memmove)

/*
 * memcpy(dst, src, len) / __align_cpy_1(dst, src, len)
 *
 * In:	%o0 = destination, %o1 = source, %o2 = byte count
 * Out:	%o0 = original destination
 *
 * forcpy is also the forward-copy target used by memmove above.
 *
 * Leaf paths (len <= 128, or 0 <= src - dst < 64): %o0 is never modified;
 * %g5 is the running destination pointer and, in the copy loops, %o1 holds
 * (src - dst) so that a single increment of %g5 advances both pointers.
 * Block path (.blkalgndst): takes a register window; %g5 keeps the original
 * destination for the return value, %i0/%i1 are the running dst/src,
 * %i3 is the byte count of whole 64-byte blocks and %i2 the residue.
 */
	ENTRY(memcpy)
	ENTRY(__align_cpy_1)
forcpy:
	mov	%o0, %g5		! save des address for return val
	cmp	%o2, 17			! for small counts copy bytes
	bleu,pt	%ncc, .dbytecp
	nop

	cmp	%o2, 0x80		! For lengths up to 128 bytes do not
	bleu,pn	%ncc, .no_blkcpy	! copy using ASI_BLK_INIT_ST_QUAD_LDD_P

	/*
	 * Make sure that source and destination buffers are 64 bytes apart.
	 * If they are not, do not use ASI_BLK_INIT_ST_QUAD_LDD_P asi to copy
	 * the data.
	 * The block path is taken when src < dst or when src - dst >= 0x40.
	 * Note that the andcc at .no_blkcpy sits in the delay slot of the
	 * bgeu below and is therefore executed on both paths.
	 */
	subcc	%o1, %o0, %o3
	blu	%ncc, .blkalgndst
	cmp	%o3, 0x40		! if src - dst >= 0x40
	bgeu,pt	%ncc, .blkalgndst	! then use ASI_BLK_INIT_ST_QUAD_LDD_P
.no_blkcpy:
	andcc	%o1, 3, %o5		! is src word aligned
	bz,pn	%ncc, .aldst
	cmp	%o5, 2			! is src half-word aligned
	be,pt	%ncc, .s2algn
	cmp	%o5, 3			! src is byte aligned
.s1algn:ldub	[%o1], %o3		! move 1 or 3 bytes to align it
	inc	1, %o1
	stb	%o3, [%g5]		! move a byte to align src
	inc	1, %g5
	bne,pt	%ncc, .s2algn		! offset was 1: two more bytes needed
	dec	%o2
	b	.ald			! now go align dest
	andcc	%g5, 3, %o5

.s2algn:lduh	[%o1], %o3		! know src is 2 byte aligned
	inc	2, %o1
	srl	%o3, 8, %o4
	stb	%o4, [%g5]		! have to do bytes,
	stb	%o3, [%g5 + 1]		! don't know dst alignment
	inc	2, %g5
	dec	2, %o2

.aldst:	andcc	%g5, 3, %o5		! align the destination address
.ald:	bz,pn	%ncc, .w4cp
	cmp	%o5, 2
	bz,pn	%ncc, .w2cp
	cmp	%o5, 3
.w3cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 24, %o5
	stb	%o5, [%g5]
	bne,pt	%ncc, .w1cp		! dst offset was 1, not 3
	inc	%g5
	dec	1, %o2
	andn	%o2, 3, %o3		! o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %g5, %o1		! o1 gets the difference

1:	sll	%o4, 8, %g1		! save residual bytes
	lduw	[%o1+%g5], %o4
	deccc	4, %o3
	srl	%o4, 24, %o5		! merge with residual
	or	%o5, %g1, %g1
	st	%g1, [%g5]
	bnz,pt	%ncc, 1b
	inc	4, %g5
	sub	%o1, 3, %o1		! used one byte of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w1cp:	srl	%o4, 8, %o5
	sth	%o5, [%g5]
	inc	2, %g5
	dec	3, %o2
	andn	%o2, 3, %o3		! o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %g5, %o1		! o1 gets the difference

2:	sll	%o4, 24, %g1		! save residual bytes
	lduw	[%o1+%g5], %o4
	deccc	4, %o3
	srl	%o4, 8, %o5		! merge with residual
	or	%o5, %g1, %g1
	st	%g1, [%g5]
	bnz,pt	%ncc, 2b
	inc	4, %g5
	sub	%o1, 1, %o1		! used three bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w2cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 16, %o5
	sth	%o5, [%g5]
	inc	2, %g5
	dec	2, %o2
	andn	%o2, 3, %o3		! o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %g5, %o1		! o1 gets the difference

3:	sll	%o4, 16, %g1		! save residual bytes
	lduw	[%o1+%g5], %o4
	deccc	4, %o3
	srl	%o4, 16, %o5		! merge with residual
	or	%o5, %g1, %g1
	st	%g1, [%g5]
	bnz,pt	%ncc, 3b
	inc	4, %g5
	sub	%o1, 2, %o1		! used two bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w4cp:	andn	%o2, 3, %o3		! o3 is aligned word count
	sub	%o1, %g5, %o1		! o1 gets the difference

1:	lduw	[%o1+%g5], %o4		! read from address
	deccc	4, %o3			! decrement count
	st	%o4, [%g5]		! write at destination address
	bgu,pt	%ncc, 1b
	inc	4, %g5			! increment to address
	b	7f
	and	%o2, 3, %o2		! number of leftover bytes, if any

	!
	! differenced byte copy, works with any alignment
	!
.dbytecp:
	b	7f
	sub	%o1, %g5, %o1		! o1 gets the difference

4:	stb	%o4, [%g5]		! write to address
	inc	%g5			! inc to address
7:	deccc	%o2			! decrement count
	bgeu,a,pt %ncc,4b		! loop till done
	ldub	[%o1+%g5], %o4		! read from address
	retl				! %o0 was preserved
	nop

.blkalgndst:
	save	%sp, -SA(MINFRAME), %sp

#ifdef NIAGARA2_IMPL
	rd	%fprs, %l7		! save orig %fprs into %l7

	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
	! So set it anyway, without checking.
	wr	%g0, 0x4, %fprs		! fprs.fef = 1
#endif	/* NIAGARA2_IMPL */

	! Block (64 bytes) align the destination.
	andcc	%i0, 0x3f, %i3		! is dst block aligned
	bz	%ncc, .chksrc		! dst already block aligned
	sub	%i3, 0x40, %i3		! (delay slot) %i3 is recomputed
					! at .chksrc when the branch is taken
	neg	%i3			! bytes till dst 64 bytes aligned
	sub	%i2, %i3, %i2		! update i2 with new count

	! Based on source and destination alignment do
	! either 8 bytes, 4 bytes, 2 bytes or byte copy.

	! Is dst & src 8B aligned
	or	%i0, %i1, %o2
	andcc	%o2, 0x7, %g0
	bz	%ncc, .alewdcp
	nop

	! Is dst & src 4B aligned
	andcc	%o2, 0x3, %g0
	bz	%ncc, .alwdcp
	nop

	! Is dst & src 2B aligned
	andcc	%o2, 0x1, %g0
	bz	%ncc, .alhlfwdcp
	nop

	! 1B aligned
1:	ldub	[%i1], %o2
	stb	%o2, [%i0]
	inc	%i1
	deccc	%i3
	bgu,pt	%ncc, 1b
	inc	%i0

	ba	.chksrc
	nop

	! dst & src 4B aligned
.alwdcp:
	ld	[%i1], %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	subcc	%i3, 0x4, %i3
	bgu,pt	%ncc, .alwdcp
	add	%i0, 0x4, %i0

	ba	.chksrc
	nop

	! dst & src 2B aligned
.alhlfwdcp:
	lduh	[%i1], %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	subcc	%i3, 0x2, %i3
	bgu,pt	%ncc, .alhlfwdcp
	add	%i0, 0x2, %i0

	ba	.chksrc
	nop

	! dst & src 8B aligned
.alewdcp:
	ldx	[%i1], %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	subcc	%i3, 0x8, %i3
	bgu,pt	%ncc, .alewdcp
	add	%i0, 0x8, %i0

	! Now Destination is block (64 bytes) aligned
.chksrc:
	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
	sub	%i2, %i3, %i2		! Residue bytes in %i2

	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi

#ifdef NIAGARA2_IMPL
	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
	prefetch [%l0+0x0], #one_read
	andcc	%i1, 0x3f, %g0		! is src 64B aligned
	bz,pn	%ncc, .blkcpy
	nop

	! handle misaligned source cases
	alignaddr %i1, %g0, %g0		! generate %gsr

	srl	%i1, 0x3, %l1		! src add bits 3, 4, 5 are now least
					! significant in %l1
	andcc	%l1, 0x7, %l2		! keep only those three bits: index of
					! the 8 byte slot within the block
	add	%i1, %i3, %i1		! src is advanced past the blocks up
					! front; the loops below read via %l0

	! switch statement to get to right 8 byte block within
	! 64 byte block
	cmp	%l2, 0x4
	bgeu,a	hlf
	cmp	%l2, 0x6		! (annulled slot) only if branch taken
	cmp	%l2, 0x2
	bgeu,a	sqtr
	nop
	cmp	%l2, 0x1
	be,a	off15
	nop
	ba	off7
	nop
sqtr:
	be,a	off23			! slot index == 2
	nop
	ba,a	off31			! slot index == 3
	nop

hlf:
	bgeu,a	fqtr			! slot index >= 6
	nop
	cmp	%l2, 0x5
	be,a	off47
	nop
	ba	off39
	nop
fqtr:
	be,a	off55			! slot index == 6
	nop

	! Falls through for slot index 7 (source offset 56-63 in its block)
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
7:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_56_63
	fmovd	%d30, %d14		! carry tail of this block to next pass

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 7b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

off7:
	ldda	[%l0]ASI_BLK_P, %d0
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
0:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_1_7
	fmovd	%d16, %d0
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 0b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

off15:
	ldd	[%l0+0x8], %d2
	ldd	[%l0+0x10], %d4
	ldd	[%l0+0x18], %d6
	ldd	[%l0+0x20], %d8
	ldd	[%l0+0x28], %d10
	ldd	[%l0+0x30], %d12
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
1:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_8_15
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 1b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

off23:
	ldd	[%l0+0x10], %d4
	ldd	[%l0+0x18], %d6
	ldd	[%l0+0x20], %d8
	ldd	[%l0+0x28], %d10
	ldd	[%l0+0x30], %d12
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
2:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_16_23
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 2b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

off31:
	ldd	[%l0+0x18], %d6
	ldd	[%l0+0x20], %d8
	ldd	[%l0+0x28], %d10
	ldd	[%l0+0x30], %d12
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
3:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_24_31
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 3b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

off39:
	ldd	[%l0+0x20], %d8
	ldd	[%l0+0x28], %d10
	ldd	[%l0+0x30], %d12
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
4:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_32_39
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 4b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

off47:
	ldd	[%l0+0x28], %d10
	ldd	[%l0+0x30], %d12
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
5:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_40_47
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 5b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

off55:
	ldd	[%l0+0x30], %d12
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
6:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_48_55
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 6b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

	! src and dst are both 64 byte aligned: straight block copy
.blkcpy:
	prefetch [%i1+0x40], #one_read
	prefetch [%i1+0x80], #one_read
8:
	stxa	%g0, [%i0]%asi		! initialize the cache line
	ldda	[%i1]ASI_BLK_P, %d0
	stda	%d0, [%i0]ASI_BLK_P

	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 8b
	prefetch [%i1+0x80], #one_read
	membar	#Sync

.blkdone:
#else	/* NIAGARA2_IMPL */
	andcc	%i1, 0xf, %l1		! is src quadword aligned
	bz,pn	%ncc, .blkcpy		! src offset in %l1
	nop
	cmp	%l1, 0x8
	bgu	%ncc, .cpy_upper_double
	nop
	blu	%ncc, .cpy_lower_double
	nop

	! Falls through when source offset is equal to 8 i.e.
	! source is double word aligned.
	! In this case no shift/merge of data is required

	sub	%i1, %l1, %i1		! align the src at 16 bytes.
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %o2
loop0:
	ldda	[%i1+0x10]%asi, %o4
	prefetch [%o0+0x40], #one_read

	stxa	%o3, [%i0+0x0]%asi
	stxa	%o4, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %o2
	stxa	%o5, [%i0+0x10]%asi
	stxa	%o2, [%i0+0x18]%asi

	ldda	[%i1+0x30]%asi, %o4
	stxa	%o3, [%i0+0x20]%asi
	stxa	%o4, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %o2
	stxa	%o5, [%i0+0x30]%asi
	stxa	%o2, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, loop0
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %l1, %i1		! increment the source by src offset

.cpy_lower_double:
	sub	%i1, %l1, %i1		! align the src at 16 bytes.
	sll	%l1, 3, %l2		! %l2 left shift
	mov	0x40, %l3
	sub	%l3, %l2, %l3		! %l3 right shift = (64 - left shift)
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %o2	! partial data in %o2 and %o3 has
					! complete data
loop1:
	ldda	[%i1+0x10]%asi, %o4	! %o4 has partial data for this read.
	ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)	! merge %o2, %o3 and %o4
							! into %o2 and %o3
	prefetch [%o0+0x40], #one_read
	stxa	%o2, [%i0+0x0]%asi
	stxa	%o3, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %o2
	ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)	! merge %o2 with %o5 and
	stxa	%o4, [%i0+0x10]%asi			! %o4 from previous read
	stxa	%o5, [%i0+0x18]%asi			! into %o4 and %o5

	! Repeat the same for next 32 bytes.

	ldda	[%i1+0x30]%asi, %o4
	ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)
	stxa	%o2, [%i0+0x20]%asi
	stxa	%o3, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %o2
	ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)
	stxa	%o4, [%i0+0x30]%asi
	stxa	%o5, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, loop1
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %l1, %i1		! increment the source by src offset

.cpy_upper_double:
	sub	%i1, %l1, %i1		! align the src at 16 bytes.
	mov	0x8, %l2
	sub	%l1, %l2, %l2
	sll	%l2, 3, %l2		! %l2 left shift
	mov	0x40, %l3
	sub	%l3, %l2, %l3		! %l3 right shift = (64 - left shift)
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %o2	! partial data in %o3 for this read and
					! no data in %o2
loop2:
	ldda	[%i1+0x10]%asi, %o4	! %o4 has complete data and %o5 has
					! partial
	ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)	! merge %o3, %o4 and %o5
							! into %o3 and %o4
	prefetch [%o0+0x40], #one_read
	stxa	%o3, [%i0+0x0]%asi
	stxa	%o4, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %o2
	ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)	! merge %o2 and %o3 with
	stxa	%o5, [%i0+0x10]%asi			! %o5 from previous read
	stxa	%o2, [%i0+0x18]%asi			! into %o5 and %o2

	! Repeat the same for next 32 bytes.

	ldda	[%i1+0x30]%asi, %o4
	ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)
	stxa	%o3, [%i0+0x20]%asi
	stxa	%o4, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %o2
	ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)
	stxa	%o5, [%i0+0x30]%asi
	stxa	%o2, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, loop2
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %l1, %i1		! increment the source by src offset

	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.blkcpy:
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
1:
	prefetch [%o0+0x40], #one_read

	ldda	[%i1+0x0]%asi, %o2
	ldda	[%i1+0x10]%asi, %o4

	stxa	%o2, [%i0+0x0]%asi
	stxa	%o3, [%i0+0x8]%asi
	stxa	%o4, [%i0+0x10]%asi
	stxa	%o5, [%i0+0x18]%asi

	ldda	[%i1+0x20]%asi, %o2
	ldda	[%i1+0x30]%asi, %o4

	stxa	%o2, [%i0+0x20]%asi
	stxa	%o3, [%i0+0x28]%asi
	stxa	%o4, [%i0+0x30]%asi
	stxa	%o5, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, 1b
	add	%i0, 0x40, %i0

.blkdone:
	membar	#Sync
#endif	/* NIAGARA2_IMPL */

	mov	ASI_PNF, %asi		! restore %asi to default
					! ASI_PRIMARY_NOFAULT value
	tst	%i2
	bz,pt	%ncc, .blkexit
	nop

	! Handle trailing bytes
	cmp	%i2, 0x8
	blu,pt	%ncc, .residue
	nop

	! Can we do some 8B ops
	or	%i1, %i0, %o2
	andcc	%o2, 0x7, %g0
	bnz	%ncc, .last4
	nop

	! Do 8byte ops as long as possible
.last8:
	ldx	[%i1], %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	sub	%i2, 0x8, %i2
	cmp	%i2, 0x8
	bgeu,pt	%ncc, .last8		! bgeu, not bgu: a final chunk of
					! exactly 8 bytes is copied here
					! instead of by the byte loop
	add	%i0, 0x8, %i0

	tst	%i2
	bz,pt	%ncc, .blkexit
	nop

	ba	.residue
	nop

.last4:
	! Can we do 4B ops
	andcc	%o2, 0x3, %g0
	bnz	%ncc, .last2
	nop
1:
	ld	[%i1], %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	sub	%i2, 0x4, %i2
	cmp	%i2, 0x4
	bgeu,pt	%ncc, 1b		! keep going while a full word remains
	add	%i0, 0x4, %i0

	cmp	%i2, 0
	bz,pt	%ncc, .blkexit
	nop

	ba	.residue
	nop

.last2:
	! Can we do 2B ops
	andcc	%o2, 0x1, %g0
	bnz	%ncc, .residue
	nop

1:
	lduh	[%i1], %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	sub	%i2, 0x2, %i2
	cmp	%i2, 0x2
	bgeu,pt	%ncc, 1b		! keep going while a half-word remains
	add	%i0, 0x2, %i0

	cmp	%i2, 0
	bz,pt	%ncc, .blkexit
	nop

	! Byte copy of whatever is left (%i2 > 0 on entry)
.residue:
	ldub	[%i1], %o2
	stb	%o2, [%i0]
	inc	%i1
	deccc	%i2
	bgu,pt	%ncc, .residue
	inc	%i0

.blkexit:
#ifdef NIAGARA2_IMPL
	and	%l7, 0x4, %l7		! fprs.du = fprs.dl = 0
	wr	%l7, %g0, %fprs		! fprs = %l7 - restore fprs.fef
#endif	/* NIAGARA2_IMPL */
	ret
	restore	%g5, %g0, %o0		! return the saved destination
	SET_SIZE(memcpy)
	SET_SIZE(__align_cpy_1)