summaryrefslogtreecommitdiff
path: root/src/pkg/big
diff options
context:
space:
mode:
authorRuss Cox <rsc@golang.org>2009-11-18 09:54:51 -0800
committerRuss Cox <rsc@golang.org>2009-11-18 09:54:51 -0800
commit082835bd1c72eabce2b659e3a4d35161b4b0097e (patch)
treea8cbc1f88dde9182b9c54bae92e9ef5711043de8 /src/pkg/big
parent4bc38508917e116c3bb3e84f138c8273f38f9cd1 (diff)
downloadgolang-082835bd1c72eabce2b659e3a4d35161b4b0097e.tar.gz
big: implement 386 assembly routines
7x speedup on big and crypto/rsa unit tests. Also dropped useAsm in favor of making the asm stubs jump to the Go versions. R=agl1 CC=golang-dev, gri http://codereview.appspot.com/157062
Diffstat (limited to 'src/pkg/big')
-rw-r--r--src/pkg/big/arith.go24
-rw-r--r--src/pkg/big/arith_386.s162
-rw-r--r--src/pkg/big/arith_amd64.s5
-rw-r--r--src/pkg/big/arith_arm.s22
4 files changed, 184 insertions, 29 deletions
diff --git a/src/pkg/big/arith.go b/src/pkg/big/arith.go
index 8a565e790..3dcbe637f 100644
--- a/src/pkg/big/arith.go
+++ b/src/pkg/big/arith.go
@@ -298,20 +298,18 @@ var (
)
-// UseAsm returns true if the assembly routines are enabled.
-func useAsm() bool
-
func init() {
- if useAsm() {
- // Install assembly routines.
- addVV = addVV_s;
- subVV = subVV_s;
- addVW = addVW_s;
- subVW = subVW_s;
- mulAddVWW = mulAddVWW_s;
- addMulVVW = addMulVVW_s;
- divWVW = divWVW_s;
- }
+ // Debugging aid: uncomment the return below to skip the assignments and keep the routines these variables were initialized with (presumably the generic Go versions; confirm in the var block above).
+ //return;
+
+ // Install the assembly routines (*_s) unconditionally; a platform without real assembly supplies *_s stubs that jump to the Go versions.
+ addVV = addVV_s;
+ subVV = subVV_s;
+ addVW = addVW_s;
+ subVW = subVW_s;
+ mulAddVWW = mulAddVWW_s;
+ addMulVVW = addMulVVW_s;
+ divWVW = divWVW_s;
}
diff --git a/src/pkg/big/arith_386.s b/src/pkg/big/arith_386.s
index 885b15273..2f89182c1 100644
--- a/src/pkg/big/arith_386.s
+++ b/src/pkg/big/arith_386.s
@@ -5,17 +5,171 @@
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
-TEXT big·useAsm(SB),7,$0
- MOVB $0, 4(SP) // assembly routines disabled
+// func addVV_s(z, x, y *Word, n int) (c Word): z[i] = x[i] + y[i] + carry for i in [0, n); returns the final carry.
+TEXT big·addVV_s(SB),7,$0
+ MOVL z+0(FP), DI // DI = z
+ MOVL x+4(FP), SI // SI = x
+ MOVL y+8(FP), CX // CX = y
+ MOVL n+12(FP), BP // BP = n
+ MOVL $0, BX // i = 0
+ MOVL $0, DX // c = 0
+ JMP E1 // test i < n before the first iteration
+
+L1: MOVL (SI)(BX*4), AX // AX = x[i]
+ RCRL $1, DX // CF = saved carry (bit 0 of DX); the stale CF is parked in bit 31
+ ADCL (CX)(BX*4), AX // AX = x[i] + y[i] + CF
+ RCLL $1, DX // bit 0 of DX = new carry; the parked bit rotates back out, so DX is 0 or 1
+ MOVL AX, (DI)(BX*4) // z[i] = AX
+ ADDL $1, BX // i++ (clobbers CF, which is why the carry lives in DX)
+
+E1: CMPL BX, BP // i < n
+ JL L1
+
+ MOVL DX, c+16(FP) // return c
RET
-// TODO(gri) Implement these routines and enable them.
-TEXT big·addVV_s(SB),7,$0
+// func subVV_s(z, x, y *Word, n int) (c Word): z[i] = x[i] - y[i] - borrow for i in [0, n); returns the final borrow.
+// (same as addVV_s except for SBBL instead of ADCL and label names)
TEXT big·subVV_s(SB),7,$0
+ MOVL z+0(FP), DI // DI = z
+ MOVL x+4(FP), SI // SI = x
+ MOVL y+8(FP), CX // CX = y
+ MOVL n+12(FP), BP // BP = n
+ MOVL $0, BX // i = 0
+ MOVL $0, DX // c = 0
+ JMP E2 // test i < n before the first iteration
+
+L2: MOVL (SI)(BX*4), AX // AX = x[i]
+ RCRL $1, DX // CF = saved borrow (bit 0 of DX); the stale CF is parked in bit 31
+ SBBL (CX)(BX*4), AX // AX = x[i] - y[i] - CF
+ RCLL $1, DX // bit 0 of DX = new borrow; the parked bit rotates back out, so DX is 0 or 1
+ MOVL AX, (DI)(BX*4) // z[i] = AX
+ ADDL $1, BX // i++ (clobbers CF, which is why the borrow lives in DX)
+
+E2: CMPL BX, BP // i < n
+ JL L2
+
+ MOVL DX, c+16(FP) // return c
+ RET
+
+
+// func addVW_s(z, x *Word, y Word, n int) (c Word): z[i] = x[i] + c for i in [0, n), with c starting at y and then holding the carry; returns the final c (y itself when n <= 0).
TEXT big·addVW_s(SB),7,$0
+ MOVL z+0(FP), DI // DI = z
+ MOVL x+4(FP), SI // SI = x
+ MOVL y+8(FP), AX // c = y
+ MOVL n+12(FP), BP // BP = n
+ MOVL $0, BX // i = 0
+ JMP E3 // test i < n before the first iteration
+
+L3: ADDL (SI)(BX*4), AX // AX = x[i] + c; CF = carry out
+ MOVL AX, (DI)(BX*4) // z[i] = AX (MOVL leaves CF untouched)
+ RCLL $1, AX // rotate CF into bit 0 of AX
+ ANDL $1, AX // c = carry (0 or 1)
+ ADDL $1, BX // i++
+
+E3: CMPL BX, BP // i < n
+ JL L3
+
+ MOVL AX, c+16(FP) // return c
+ RET
+
+
+// func subVW_s(z, x *Word, y Word, n int) (c Word): z[i] = x[i] - c for i in [0, n), with c starting at y and then holding the borrow; returns the final c (y itself when n <= 0).
TEXT big·subVW_s(SB),7,$0
+ MOVL z+0(FP), DI // DI = z
+ MOVL x+4(FP), SI // SI = x
+ MOVL y+8(FP), AX // c = y
+ MOVL n+12(FP), BP // BP = n
+ MOVL $0, BX // i = 0
+ JMP E4 // test i < n before the first iteration
+
+L4: MOVL (SI)(BX*4), DX // TODO(gri) is there a reverse SUBL?
+ SUBL AX, DX // DX = x[i] - c; CF = borrow out
+ MOVL DX, (DI)(BX*4) // z[i] = DX (MOVL leaves CF untouched)
+ RCLL $1, AX // rotate CF into bit 0 of AX
+ ANDL $1, AX // c = borrow (0 or 1)
+ ADDL $1, BX // i++
+
+E4: CMPL BX, BP // i < n
+ JL L4
+
+ MOVL AX, c+16(FP) // return c
+ RET
+
+
+// func mulAddVWW_s(z, x *Word, y, r Word, n int) (c Word): for each i in [0, n), z[i] = low word of x[i]*y + c and c = high word, with c starting at r; returns the final c.
TEXT big·mulAddVWW_s(SB),7,$0
+ MOVL z+0(FP), DI // DI = z
+ MOVL x+4(FP), SI // SI = x
+ MOVL y+8(FP), BP // BP = y
+ MOVL r+12(FP), CX // c = r
+ MOVL n+16(FP), BX // BX = n
+ LEAL (SI)(BX*4), SI // SI = &x[n], so a negative index walks up towards the end
+ LEAL (DI)(BX*4), DI // DI = &z[n]
+ NEGL BX // i = -n
+ JMP E5 // test i < 0 before the first iteration
+
+L5: MOVL (SI)(BX*4), AX // AX = x[n+i]
+ MULL BP // DX:AX = AX * y
+ ADDL CX, AX // low word += c
+ ADCL $0, DX // propagate the carry into the high word
+ MOVL AX, (DI)(BX*4) // z[n+i] = low word
+ MOVL DX, CX // c = high word
+ ADDL $1, BX // i++
+
+E5: CMPL BX, $0 // i < 0
+ JL L5
+
+ MOVL CX, c+20(FP) // return c
+ RET
+
+
+// func addMulVVW_s(z, x *Word, y Word, n int) (c Word): for each i in [0, n), z[i] = low word of x[i]*y + z[i] + c and c = high word, with c starting at 0; returns the final c.
TEXT big·addMulVVW_s(SB),7,$0
+ MOVL z+0(FP), DI // DI = z
+ MOVL x+4(FP), SI // SI = x
+ MOVL y+8(FP), BP // BP = y
+ MOVL n+12(FP), BX // BX = n
+ LEAL (SI)(BX*4), SI // SI = &x[n], so a negative index walks up towards the end
+ LEAL (DI)(BX*4), DI // DI = &z[n]
+ NEGL BX // i = -n
+ MOVL $0, CX // c = 0
+ JMP E6 // test i < 0 before the first iteration
+
+L6: MOVL (SI)(BX*4), AX // AX = x[n+i]
+ MULL BP // DX:AX = AX * y
+ ADDL (DI)(BX*4), AX // low word += z[n+i]
+ ADCL $0, DX // propagate the carry into the high word
+ ADDL CX, AX // low word += c
+ ADCL $0, DX // propagate the carry into the high word
+ MOVL AX, (DI)(BX*4) // z[n+i] = low word
+ MOVL DX, CX // c = high word
+ ADDL $1, BX // i++
+
+E6: CMPL BX, $0 // i < 0
+ JL L6
+
+ MOVL CX, c+16(FP) // return c
+ RET
+
+
+// func divWVW_s(z *Word, xn Word, x *Word, y Word, n int) (r Word): for i from n-1 down to 0, divides the two-word value r:x[i] by y, storing the quotient in z[i] and keeping the remainder in r, with r starting at xn; returns the final r.
TEXT big·divWVW_s(SB),7,$0
+ MOVL z+0(FP), DI // DI = z
+ MOVL xn+4(FP), DX // r = xn
+ MOVL x+8(FP), SI // SI = x
+ MOVL y+12(FP), CX // CX = y
+ MOVL n+16(FP), BX // i = n
+ JMP E7 // decrement and test before the first iteration
+
+L7: MOVL (SI)(BX*4), AX // AX = x[i]
+ DIVL CX // AX = DX:AX / y, DX = remainder. NOTE(review): DIVL faults if the quotient overflows a word (DX >= y); presumably callers guarantee xn < y, confirm.
+ MOVL AX, (DI)(BX*4) // z[i] = quotient
+
+E7: SUBL $1, BX // i--
+ JGE L7 // i >= 0
+
+ MOVL DX, r+20(FP) // return r
RET
diff --git a/src/pkg/big/arith_amd64.s b/src/pkg/big/arith_amd64.s
index 4733a7c3a..f9b070b74 100644
--- a/src/pkg/big/arith_amd64.s
+++ b/src/pkg/big/arith_amd64.s
@@ -5,11 +5,6 @@
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
-TEXT big·useAsm(SB),7,$0
- MOVB $1, 8(SP) // assembly routines enabled
- RET
-
-
// TODO(gri) - experiment with unrolled loops for faster execution
// func addVV_s(z, x, y *Word, n int) (c Word)
diff --git a/src/pkg/big/arith_arm.s b/src/pkg/big/arith_arm.s
index 885b15273..8bb1e9c28 100644
--- a/src/pkg/big/arith_arm.s
+++ b/src/pkg/big/arith_arm.s
@@ -5,17 +5,25 @@
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
-TEXT big·useAsm(SB),7,$0
- MOVB $0, 4(SP) // assembly routines disabled
- RET
-
-
-// TODO(gri) Implement these routines and enable them.
+// TODO(gri) Implement these routines. Until then each *_s stub simply jumps to the corresponding *_g routine.
TEXT big·addVV_s(SB),7,$0
+ JMP big·addVV_g(SB)
+
TEXT big·subVV_s(SB),7,$0
+ JMP big·subVV_g(SB)
+
TEXT big·addVW_s(SB),7,$0
+ JMP big·addVW_g(SB)
+
TEXT big·subVW_s(SB),7,$0
+ JMP big·subVW_g(SB)
+
TEXT big·mulAddVWW_s(SB),7,$0
+ JMP big·mulAddVWW_g(SB)
+
TEXT big·addMulVVW_s(SB),7,$0
+ JMP big·addMulVVW_g(SB)
+
TEXT big·divWVW_s(SB),7,$0
- RET
+ JMP big·divWVW_g(SB)
+