@
@ file core_asm.s
@ core asm routines
@ author cearn
@ Modified by Legolas for fpc4gba use
@
@ === NOTES ===
@ * 20050924: Lower overhead for all; reduced i-count for u16 loops.
@ * These are 16/32bit memset and memcpy. The 32bit versions are meant to
@   live in iwram for maximum effect and pretty much do what CpuFastSet
@   does, except that they work for non multiples of 8 words too. Speed
@   is as good as CpuFastSet, but with a little less overhead.
@   NOTE: in this file the ".section .iwram" directives are commented out,
@   so the 32bit routines are currently assembled into .text.
@ * The 16bit versions call the 32bit ones if possible and/or desirable.
@   They are thumb/ROM functions but were written in asm anyway because
@   GCC goes haywire with the use of registers, resulting in a much
@   higher overhead (i.e., detrimental for low counts).
@ * Crossover with inline while(nn--) loops (not for(ii++), which are
@   much slower):
@     memcpy32: ~4
@     memset32: ~5
@     memcpy16: ~8
@     memset16: ~8

	.file	"core_asm.as"

@ === procedure memcpy32(dest: pointer; const src: pointer; wcount: u32); ======
@ Fast-copy by words.
@ param dest    Destination address.
@ param src     Source address.
@ param wcount  Number of words.
@ note: src and dst must be word aligned.
@ note: r0 and r1 return as dst + wcount and src + wcount (both pointers are
@       advanced by the writeback forms of ldmia/stmia).
@ Reglist:
@   r0, r1: dst, src
@   r2:     wcount, then wcount>>3 (number of 8-word chunks)
@   r3-r10: data buffer (r4-r10 are saved/restored around the main loop)
@   r12:    wcount&7 (residual word count)
	.text			@ ?!?!?
@	.section .iwram,"ax", %progbits
	.align	2
	.code	32
	.global	memcpy32
memcpy32:
	and		r12, r2, #7		@ r12 = residual words (0-7)
	movs	r2, r2, lsr #3		@ r2 = number of 32-byte chunks
	beq		.Lres_cpy32		@ no full chunk: residual only
	stmfd	sp!, {r4-r10}
	@ copy 32byte chunks with 8fold xxmia
.Lmain_cpy32:
		ldmia	r1!, {r3-r10}
		stmia	r0!, {r3-r10}
		subs	r2, r2, #1
		bhi		.Lmain_cpy32	@ loop while chunks remain
	ldmfd	sp!, {r4-r10}
	@ and the residual 0-7 words
.Lres_cpy32:
		subs	r12, r12, #1	@ carry set while r12 was >= 1
		ldmcsia	r1!, {r3}
		stmcsia	r0!, {r3}
		bcs		.Lres_cpy32
	bx	lr

@ === procedure memset32(dest: pointer; wd: u32; wcount: u32); =================
@ Fast-fill by words.
@ param dest    Destination address.
@ param wd      Fill word (not address).
@ param wcount  Number of words to fill.
@ note: dst must be word aligned.
@ note: r0 returns as dst + wcount; r1 (the fill word) is left unchanged.
@ Reglist:
@   r0, r1: dst, fill word
@   r2:     wcount, then wcount>>3 (number of 8-word chunks)
@   r3-r10: data buffer (r4-r10 are saved/restored around the main loop)
@   r12:    wcount&7 (residual word count)
	.text			@ ?!?!?
@	.section .iwram,"ax", %progbits
	.align	2
	.code	32
	.global	memset32
memset32:
	and		r12, r2, #7		@ r12 = residual words (0-7)
	movs	r2, r2, lsr #3		@ r2 = number of 32-byte chunks
	beq		.Lres_set32		@ no full chunk: residual only
	stmfd	sp!, {r4-r10}
	@ set 32byte chunks with 8fold xxmia
	mov		r3, r1
	mov		r4, r1
	mov		r5, r1
	mov		r6, r1
	mov		r7, r1
	mov		r8, r1
	mov		r9, r1
	mov		r10, r1
.Lmain_set32:
		stmia	r0!, {r3-r10}
		subs	r2, r2, #1
		bhi		.Lmain_set32	@ loop while chunks remain
	ldmfd	sp!, {r4-r10}
	@ residual 0-7 words
.Lres_set32:
		subs	r12, r12, #1	@ carry set while r12 was >= 1
		stmcsia	r0!, {r1}
		bcs		.Lres_set32
	bx	lr

@ === procedure memcpy16(dest: pointer; const src: pointer; hwcount: u32); =====
@ Copy for halfwords.
@ Uses memcpy32() if hwcount>5 and src and dst are aligned equally.
@ param dest     Destination address.
@ param src      Source address.
@ param hwcount  Number of halfwords to copy.
@ note: dst and src must be halfword aligned.
@ note: on the memcpy32 path r0 and r1 are advanced past the words copied;
@       the halfword loop uses indexed addressing and does not advance them.
@ Reglist:
@   r0, r1: dst, src
@   r2, r4: hwcount (r4 keeps the odd-halfword flag across the memcpy32 call)
@   r3:     tmp; and data buffer
	.text
	.align	2
	.code	16
	.global	memcpy16
	.thumb_func
memcpy16:
	push	{r4, lr}
	@ 5 hwords or fewer -> std cpy
	cmp		r2, #5
	bls		.Ltail_cpy16
	@ unreconcilable alignment -> std cpy
	@ if (dst^src)&2 -> alignment impossible
	mov		r3, r0
	eor		r3, r1
	lsl		r3, r3, #31		@ (dst^src), bit 1 into carry
	bcs		.Ltail_cpy16	@ (dst^src)&2 : must copy by halfword
	@ src and dst have same alignment -> word align
	lsl		r3, r0, #31		@ dst bit 1 into carry
	bcc		.Lmain_cpy16	@ ~dst&2 : already word aligned
	@ aligning is necessary: copy 1 hword and align
		ldrh	r3, [r1]
		strh	r3, [r0]
		add		r0, #2
		add		r1, #2
		sub		r2, r2, #1
	@ right, and for the REAL work, we're gonna use memcpy32
.Lmain_cpy16:
	lsl		r4, r2, #31		@ r4 bit 31 = odd trailing halfword flag
	lsr		r2, r2, #1		@ r2 = word count
	ldr		r3, .Lpool_cpy16
	@ Call through a bx trampoline: bl sets lr to the Thumb return address,
	@ so memcpy32's "bx lr" comes back here. A bare "bx r3" would leave lr
	@ pointing at our caller: the tail would be skipped and the frame
	@ pushed above would never be popped.
	bl		.Lcall_cpy16
	@ NOTE: r0,r1 are altered by memcpy32, but in exactly the right
	@ way, so we can use them as is.
	lsr		r2, r4, #31		@ r2 = 1 if a halfword is left, else 0
	beq		.Lend_cpy16
.Ltail_cpy16:
	sub		r2, #1
	bcc		.Lend_cpy16		@ r2 was 0, bug out
	lsl		r2, r2, #1		@ r2 = byte offset of the last halfword
.Lres_cpy16:
		ldrh	r3, [r1, r2]	@ copy from the end backwards
		strh	r3, [r0, r2]
		sub		r2, r2, #2
		bcs		.Lres_cpy16
.Lend_cpy16:
	pop		{r4}
	pop		{r3}			@ r3 = saved lr
.Lcall_cpy16:
	bx	r3				@ return, or interworking call target for bl above
	.align	2
.Lpool_cpy16:
	.word	memcpy32

@ === procedure memset16(dest: pointer; hw: u16; hwcount: u32); ================
@ Fill for halfwords.
@ Uses memset32() if hwcount>5
@ param dest     Destination address.
@ param hw       Source halfword (not address).
@ param hwcount  Number of halfwords to fill.
@ note: dest must be halfword aligned.
@ note: on the memset32 path r0 is advanced past the words filled; the
@       halfword loop uses indexed addressing and does not advance it.
@ note: hw is widened with (hw | hw<<16); this presumes the upper 16 bits
@       of r1 are zero on entry -- confirm against callers.
@ Reglist:
@   r0, r1: dst, hw
@   r2, r4: hwcount (r4 keeps the odd-halfword flag across the memset32 call)
@   r3:     tmp; and data buffer
	.text
	.align	2
	.code	16
	.global	memset16
	.thumb_func
memset16:
	push	{r4, lr}
	@ under 6 hwords -> std set
	cmp		r2, #5
	bls		.Ltail_set16
	@ dst not word aligned: store 1 hword and align
	lsl		r3, r0, #31		@ dst bit 1 into carry
	bcc		.Lmain_set16
		strh	r1, [r0]
		add		r0, #2
		sub		r2, r2, #1
	@ Again, memset32 does the real work
.Lmain_set16:
	lsl		r4, r1, #16
	orr		r1, r4			@ r1 = hw | hw<<16
	lsl		r4, r2, #31		@ r4 bit 31 = odd trailing halfword flag
	lsr		r2, r2, #1		@ r2 = word count
	ldr		r3, .Lpool_set16
	@ Call through a bx trampoline: bl sets lr to the Thumb return address,
	@ so memset32's "bx lr" comes back here. A bare "bx r3" would leave lr
	@ pointing at our caller: the tail would be skipped and the frame
	@ pushed above would never be popped.
	bl		.Lcall_set16
	@ NOTE: r0 is altered by memset32, but in exactly the right
	@ way, so we can use it as is. r1 is now doubled though.
	lsr		r2, r4, #31		@ r2 = 1 if a halfword is left, else 0
	beq		.Lend_set16
	lsr		r1, #16			@ back to a single halfword
.Ltail_set16:
	sub		r2, #1
	bcc		.Lend_set16		@ r2 was 0, bug out
	lsl		r2, r2, #1		@ r2 = byte offset of the last halfword
.Lres_set16:
		strh	r1, [r0, r2]	@ fill from the end backwards
		sub		r2, r2, #2
		bcs		.Lres_set16
.Lend_set16:
	pop		{r4}
	pop		{r3}			@ r3 = saved lr
.Lcall_set16:
	bx	r3				@ return, or interworking call target for bl above
	.align	2
.Lpool_set16:
	.word	memset32