diff options
Diffstat (limited to 'src/pkg/big/arith_arm.s')
-rw-r--r-- | src/pkg/big/arith_arm.s | 289 |
1 files changed, 280 insertions, 9 deletions
diff --git a/src/pkg/big/arith_arm.s b/src/pkg/big/arith_arm.s index c8a45efc4..e4a9a962c 100644 --- a/src/pkg/big/arith_arm.s +++ b/src/pkg/big/arith_arm.s @@ -5,31 +5,302 @@ // This file provides fast assembly versions for the elementary // arithmetic operations on vectors implemented in arith.go. -// TODO(gri) Implement these routines. +#define CFLAG 29 // bit position of carry flag + +// func addVV(z, x, y []Word) (c Word) TEXT ·addVV(SB),7,$0 - B ·addVV_g(SB) + MOVW $0, R0 + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + MOVW n+4(FP), R4 + MOVW R4<<2, R4 + ADD R1, R4 + B E1 +L1: + MOVW.P 4(R2), R5 + MOVW.P 4(R3), R6 + MOVW R0, CPSR + ADC.S R6, R5 + MOVW.P R5, 4(R1) + MOVW CPSR, R0 +E1: + CMP R1, R4 + BNE L1 + + MOVW R0>>CFLAG, R0 + AND $1, R0 + MOVW R0, c+36(FP) + RET + +// func subVV(z, x, y []Word) (c Word) +// (same as addVV except for SBC instead of ADC and label names) TEXT ·subVV(SB),7,$0 - B ·subVV_g(SB) + MOVW $(1<<CFLAG), R0 + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + MOVW n+4(FP), R4 + MOVW R4<<2, R4 + ADD R1, R4 + B E2 +L2: + MOVW.P 4(R2), R5 + MOVW.P 4(R3), R6 + MOVW R0, CPSR + SBC.S R6, R5 + MOVW.P R5, 4(R1) + MOVW CPSR, R0 +E2: + CMP R1, R4 + BNE L2 + + MOVW R0>>CFLAG, R0 + AND $1, R0 + EOR $1, R0 + MOVW R0, c+36(FP) + RET + +// func addVW(z, x []Word, y Word) (c Word) TEXT ·addVW(SB),7,$0 - B ·addVW_g(SB) + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + MOVW n+4(FP), R4 + MOVW R4<<2, R4 + ADD R1, R4 + CMP R1, R4 + BNE L3a + MOVW R3, c+28(FP) + RET +L3a: + MOVW.P 4(R2), R5 + ADD.S R3, R5 + MOVW.P R5, 4(R1) + MOVW CPSR, R0 + B E3 +L3: + MOVW.P 4(R2), R5 + MOVW R0, CPSR + ADC.S $0, R5 + MOVW.P R5, 4(R1) + MOVW CPSR, R0 +E3: + CMP R1, R4 + BNE L3 + + MOVW R0>>CFLAG, R0 + AND $1, R0 + MOVW R0, c+28(FP) + RET + TEXT ·subVW(SB),7,$0 - B ·subVW_g(SB) + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + MOVW n+4(FP), R4 + MOVW R4<<2, R4 + ADD R1, R4 + CMP R1, R4 + BNE L4a + MOVW R3, c+28(FP) + RET +L4a: + MOVW.P 4(R2), R5 + SUB.S R3, R5 + MOVW.P R5, 4(R1) + MOVW CPSR, R0 + B E4 +L4: + MOVW.P 4(R2), R5 + MOVW R0, CPSR + SBC.S $0, R5 + MOVW.P R5, 4(R1) + MOVW CPSR, R0 +E4: + CMP R1, R4 + BNE L4 + + MOVW R0>>CFLAG, R0 + AND $1, R0 + EOR $1, R0 + MOVW R0, c+28(FP) + RET + +// func shlVW(z, x []Word, s Word) (c Word) TEXT ·shlVW(SB),7,$0 - B ·shlVW_g(SB) + MOVW n+4(FP), R5 + CMP $0, R5 + BEQ X7 + + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW R5<<2, R5 + ADD R5, R2 + ADD R1, R5 + MOVW s+24(FP), R3 + CMP $0, R3 // shift 0 is special + BEQ Y7 + ADD $4, R1 // stop one word early + MOVW $32, R4 + SUB R3, R4 + MOVW $0, R7 + + MOVW.W -4(R2), R6 + MOVW R6<<R3, R7 + MOVW R6>>R4, R6 + MOVW R6, c+28(FP) + B E7 + +L7: + MOVW.W -4(R2), R6 + ORR R6>>R4, R7 + MOVW.W R7, -4(R5) + MOVW R6<<R3, R7 +E7: + CMP R1, R5 + BNE L7 + + MOVW R7, -4(R5) + RET + +Y7: // copy loop, because shift 0 == shift 32 + MOVW.W -4(R2), R6 + MOVW.W R6, -4(R5) + CMP R1, R5 + BNE Y7 + +X7: + MOVW $0, R1 + MOVW R1, c+28(FP) + RET + TEXT ·shrVW(SB),7,$0 - B ·shrVW_g(SB) + MOVW n+4(FP), R5 + CMP $0, R5 + BEQ X6 + + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW R5<<2, R5 + ADD R1, R5 + MOVW s+24(FP), R3 + CMP $0, R3 // shift 0 is special + BEQ Y6 + SUB $4, R5 // stop one word early + MOVW $32, R4 + SUB R3, R4 + MOVW $0, R7 + + // first word + MOVW.P 4(R2), R6 + MOVW R6>>R3, R7 + MOVW R6<<R4, R6 + MOVW R6, c+28(FP) + B E6 + + // word loop +L6: + MOVW.P 4(R2), R6 + ORR R6<<R4, R7 + MOVW.P R7, 4(R1) + MOVW R6>>R3, R7 +E6: + CMP R1, R5 + BNE L6 + + MOVW R7, 0(R1) + RET + +Y6: // copy loop, because shift 0 == shift 32 + MOVW.P 4(R2), R6 + MOVW.P R6, 4(R1) + CMP R1, R5 + BNE Y6 + +X6: + MOVW $0, R1 + MOVW R1, c+28(FP) + RET + TEXT ·mulAddVWW(SB),7,$0 - B ·mulAddVWW_g(SB) + MOVW $0, R0 + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + MOVW r+28(FP), R4 + MOVW n+4(FP), R5 + MOVW R5<<2, R5 + ADD R1, R5 + B E8 + + // word loop +L8: + MOVW.P 4(R2), R6 + MULLU R6, R3, (R7, R6) + ADD.S R4, R6 + ADC R0, R7 + MOVW.P R6, 4(R1) + MOVW R7, R4 +E8: + CMP R1, R5 + BNE L8 + + MOVW R4, c+32(FP) + RET + TEXT ·addMulVVW(SB),7,$0 - B ·addMulVVW_g(SB) + MOVW $0, R0 + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + MOVW n+4(FP), R5 + MOVW R5<<2, R5 + ADD R1, R5 + MOVW $0, R4 + B E9 + + // word loop +L9: + MOVW.P 4(R2), R6 + MULLU R6, R3, (R7, R6) + ADD.S R4, R6 + ADC R0, R7 + MOVW 0(R1), R4 + ADD.S R4, R6 + ADC R0, R7 + MOVW.P R6, 4(R1) + MOVW R7, R4 +E9: + CMP R1, R5 + BNE L9 + + MOVW R4, c+28(FP) + RET + TEXT ·divWVW(SB),7,$0 + // ARM has no multiword division, so use portable code. B ·divWVW_g(SB) + +TEXT ·divWW(SB),7,$0 + // ARM has no multiword division, so use portable code. + B ·divWW_g(SB) + + +// func mulWW(x, y Word) (z1, z0 Word) +TEXT ·mulWW(SB),7,$0 + MOVW x+0(FP), R1 + MOVW y+4(FP), R2 + MULLU R1, R2, (R4, R3) + MOVW R4, z1+8(FP) + MOVW R3, z0+12(FP) + RET |