big: implement 386 assembly routines

7x speedup on big and crypto/rsa unit tests. also dropped useAsm in favor of making the asm stubs jump to the Go versions. R=agl1 CC=golang-dev, gri http://codereview.appspot.com/157062
author: Russ Cox <rsc@golang.org> 2009-11-18 09:54:51 -0800
committer: Russ Cox <rsc@golang.org> 2009-11-18 09:54:51 -0800
commit: 082835bd1c72eabce2b659e3a4d35161b4b0097e (patch)
tree: a8cbc1f88dde9182b9c54bae92e9ef5711043de8 /src/pkg/big
parent: 4bc38508917e116c3bb3e84f138c8273f38f9cd1 (diff)
download: golang-082835bd1c72eabce2b659e3a4d35161b4b0097e.tar.gz
4 files changed, 184 insertions, 29 deletions
diff --git a/src/pkg/big/arith.go b/src/pkg/big/arith.go
index 8a565e790..3dcbe637f 100644
--- a/src/pkg/big/arith.go
+++ b/src/pkg/big/arith.go
@@ -298,20 +298,18 @@ var (
 )
 
 
-// UseAsm returns true if the assembly routines are enabled.
-func useAsm() bool
-
 func init() {
-	if useAsm() {
-		// Install assembly routines.
-		addVV = addVV_s;
-		subVV = subVV_s;
-		addVW = addVW_s;
-		subVW = subVW_s;
-		mulAddVWW = mulAddVWW_s;
-		addMulVVW = addMulVVW_s;
-		divWVW = divWVW_s;
-	}
+	// Uncomment to use generic routines.
+	//return;
+
+	// Install assembly routines.
+	addVV = addVV_s;
+	subVV = subVV_s;
+	addVW = addVW_s;
+	subVW = subVW_s;
+	mulAddVWW = mulAddVWW_s;
+	addMulVVW = addMulVVW_s;
+	divWVW = divWVW_s;
 }
 
 
diff --git a/src/pkg/big/arith_386.s b/src/pkg/big/arith_386.s
index 885b15273..2f89182c1 100644
--- a/src/pkg/big/arith_386.s
+++ b/src/pkg/big/arith_386.s
@@ -5,17 +5,171 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
 
-TEXT big·useAsm(SB),7,$0
-	MOVB $0, 4(SP)	// assembly routines disabled
+// func addVV_s(z, x, y *Word, n int) (c Word)
+TEXT big·addVV_s(SB),7,$0
+	MOVL z+0(FP), DI
+	MOVL x+4(FP), SI
+	MOVL y+8(FP), CX
+	MOVL n+12(FP), BP
+	MOVL $0, BX         // i = 0
+	MOVL $0, DX         // c = 0
+	JMP E1
+
+L1:	MOVL (SI)(BX*4), AX
+	RCRL $1, DX
+	ADCL (CX)(BX*4), AX
+	RCLL $1, DX
+	MOVL AX, (DI)(BX*4)
+	ADDL $1, BX			// i++
+
+E1:	CMPL BX, BP		// i < n
+	JL L1
+
+	MOVL DX, c+16(FP)
 	RET
 
 
-// TODO(gri) Implement these routines and enable them.
-TEXT big·addVV_s(SB),7,$0
+// func subVV_s(z, x, y *Word, n int) (c Word)
+// (same as addVV_s except for SBBL instead of ADCL and label names)
 TEXT big·subVV_s(SB),7,$0
+	MOVL z+0(FP), DI
+	MOVL x+4(FP), SI
+	MOVL y+8(FP), CX
+	MOVL n+12(FP), BP
+	MOVL $0, BX         // i = 0
+	MOVL $0, DX         // c = 0
+	JMP E2
+
+L2:	MOVL (SI)(BX*4), AX
+	RCRL $1, DX
+	SBBL (CX)(BX*4), AX
+	RCLL $1, DX
+	MOVL AX, (DI)(BX*4)
+	ADDL $1, BX         // i++
+
+E2:	CMPL BX, BP        // i < n
+	JL L2
+
+	MOVL DX, c+16(FP)
+	RET
+
+
+// func addVW_s(z, x *Word, y Word, n int) (c Word)
 TEXT big·addVW_s(SB),7,$0
+	MOVL z+0(FP), DI
+	MOVL x+4(FP), SI
+	MOVL y+8(FP), AX   // c = y
+	MOVL n+12(FP), BP
+	MOVL $0, BX         // i = 0
+	JMP E3
+
+L3:	ADDL (SI)(BX*4), AX
+	MOVL AX, (DI)(BX*4)
+	RCLL $1, AX
+	ANDL $1, AX
+	ADDL $1, BX         // i++
+
+E3:	CMPL BX, BP        // i < n
+	JL L3
+
+	MOVL AX, c+16(FP)
+	RET
+
+
+// func subVW_s(z, x *Word, y Word, n int) (c Word)
 TEXT big·subVW_s(SB),7,$0
+	MOVL z+0(FP), DI
+	MOVL x+4(FP), SI
+	MOVL y+8(FP), AX   // c = y
+	MOVL n+12(FP), BP
+	MOVL $0, BX         // i = 0
+	JMP E4
+
+L4:	MOVL (SI)(BX*4), DX	// TODO(gri) is there a reverse SUBL?
+	SUBL AX, DX
+	MOVL DX, (DI)(BX*4)
+	RCLL $1, AX
+	ANDL $1, AX
+	ADDL $1, BX          // i++
+
+E4:	CMPL BX, BP         // i < n
+	JL L4
+
+	MOVL AX, c+16(FP)
+	RET
+
+
+// func mulAddVWW_s(z, x *Word, y, r Word, n int) (c Word)
 TEXT big·mulAddVWW_s(SB),7,$0
+	MOVL z+0(FP), DI
+	MOVL x+4(FP), SI
+	MOVL y+8(FP), BP
+	MOVL r+12(FP), CX   // c = r
+	MOVL n+16(FP), BX
+	LEAL	(SI)(BX*4), SI
+	LEAL	(DI)(BX*4), DI
+	NEGL BX  // i = -n
+	JMP E5
+
+L5:	MOVL (SI)(BX*4), AX
+	MULL BP
+	ADDL CX, AX
+	ADCL $0, DX
+	MOVL AX, (DI)(BX*4)
+	MOVL DX, CX
+	ADDL $1, BX         // i++
+
+E5:	CMPL BX, $0        // i < 0
+	JL L5
+
+	MOVL CX, c+20(FP)
+	RET
+
+
+// func addMulVVW_s(z, x *Word, y Word, n int) (c Word)
 TEXT big·addMulVVW_s(SB),7,$0
+	MOVL z+0(FP), DI
+	MOVL x+4(FP), SI
+	MOVL y+8(FP), BP
+	MOVL n+12(FP), BX
+	LEAL	(SI)(BX*4), SI
+	LEAL	(DI)(BX*4), DI
+	NEGL BX  // i = -n
+	MOVL $0, CX         // c = 0
+	JMP E6
+
+L6:	MOVL (SI)(BX*4), AX
+	MULL BP
+	ADDL (DI)(BX*4), AX
+	ADCL $0, DX
+	ADDL CX, AX
+	ADCL $0, DX
+	MOVL AX, (DI)(BX*4)
+	MOVL DX, CX
+	ADDL $1, BX         // i++
+
+E6:	CMPL BX, $0        // i < 0
+	JL L6
+
+	MOVL CX, c+16(FP)
+	RET
+
+
+// divWVW_s(z* Word, xn Word, x *Word, y Word, n int) (r Word)
 TEXT big·divWVW_s(SB),7,$0
+	MOVL z+0(FP), DI
+	MOVL xn+4(FP), DX   // r = xn
+	MOVL x+8(FP), SI
+	MOVL y+12(FP), CX
+	MOVL n+16(FP), BX   // i = n
+	JMP E7
+
+L7:	MOVL (SI)(BX*4), AX
+	DIVL CX
+	MOVL AX, (DI)(BX*4)
+
+E7:	SUBL $1, BX         // i--
+	JGE L7              // i >= 0
+
+	MOVL DX, r+20(FP)
 	RET
diff --git a/src/pkg/big/arith_amd64.s b/src/pkg/big/arith_amd64.s
index 4733a7c3a..f9b070b74 100644
--- a/src/pkg/big/arith_amd64.s
+++ b/src/pkg/big/arith_amd64.s
@@ -5,11 +5,6 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
 
-TEXT big·useAsm(SB),7,$0
-	MOVB $1, 8(SP)  // assembly routines enabled
-	RET
-
-
 // TODO(gri) - experiment with unrolled loops for faster execution
 
 // func addVV_s(z, x, y *Word, n int) (c Word)
diff --git a/src/pkg/big/arith_arm.s b/src/pkg/big/arith_arm.s
index 885b15273..8bb1e9c28 100644
--- a/src/pkg/big/arith_arm.s
+++ b/src/pkg/big/arith_arm.s
@@ -5,17 +5,25 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
 
-TEXT big·useAsm(SB),7,$0
-	MOVB $0, 4(SP)	// assembly routines disabled
-	RET
-
-
-// TODO(gri) Implement these routines and enable them.
+// TODO(gri) Implement these routines.
 TEXT big·addVV_s(SB),7,$0
+	JMP big·addVV_g(SB)
+
 TEXT big·subVV_s(SB),7,$0
+	JMP big·subVV_g(SB)
+
 TEXT big·addVW_s(SB),7,$0
+	JMP big·addVW_g(SB)
+
 TEXT big·subVW_s(SB),7,$0
+	JMP big·subVW_g(SB)
+
 TEXT big·mulAddVWW_s(SB),7,$0
+	JMP big·mulAddVWW_g(SB)
+
 TEXT big·addMulVVW_s(SB),7,$0
+	JMP big·addMulVVW_g(SB)
+
 TEXT big·divWVW_s(SB),7,$0
-	RET
+	JMP big·divWVW_g(SB)
+
author	Russ Cox <rsc@golang.org>	2009-11-18 09:54:51 -0800
committer	Russ Cox <rsc@golang.org>	2009-11-18 09:54:51 -0800
commit	082835bd1c72eabce2b659e3a4d35161b4b0097e (patch)
tree	a8cbc1f88dde9182b9c54bae92e9ef5711043de8 /src/pkg/big
parent	4bc38508917e116c3bb3e84f138c8273f38f9cd1 (diff)
download	golang-082835bd1c72eabce2b659e3a4d35161b4b0097e.tar.gz