27 files changed, 1165 insertions, 97 deletions
diff --git a/src/pkg/crypto/cipher/example_test.go b/src/pkg/crypto/cipher/example_test.go
index e0027cac2..373f6791b 100644
--- a/src/pkg/crypto/cipher/example_test.go
+++ b/src/pkg/crypto/cipher/example_test.go
@@ -233,7 +233,7 @@ func ExampleStreamReader() {
 	}
 	defer outFile.Close()
 
-	reader := &cipher.StreamReader{stream, inFile}
+	reader := &cipher.StreamReader{S: stream, R: inFile}
 	// Copy the input file to the output file, decrypting as we go.
 	if _, err := io.Copy(outFile, reader); err != nil {
 		panic(err)
@@ -270,7 +270,7 @@ func ExampleStreamWriter() {
 	}
 	defer outFile.Close()
 
-	writer := &cipher.StreamWriter{stream, outFile, nil}
+	writer := &cipher.StreamWriter{S: stream, W: outFile}
 	// Copy the input file to the output file, encrypting as we go.
 	if _, err := io.Copy(writer, inFile); err != nil {
 		panic(err)
diff --git a/src/pkg/crypto/dsa/dsa.go b/src/pkg/crypto/dsa/dsa.go
index 05766a2f1..5a2a65744 100644
--- a/src/pkg/crypto/dsa/dsa.go
+++ b/src/pkg/crypto/dsa/dsa.go
@@ -144,8 +144,6 @@ GeneratePrimes:
 		params.G = g
 		return
 	}
-
-	panic("unreachable")
 }
 
 // GenerateKey generates a public&private key pair. The Parameters of the
diff --git a/src/pkg/crypto/dsa/dsa_test.go b/src/pkg/crypto/dsa/dsa_test.go
index 177aa444d..568416d0d 100644
--- a/src/pkg/crypto/dsa/dsa_test.go
+++ b/src/pkg/crypto/dsa/dsa_test.go
@@ -63,8 +63,9 @@ func testParameterGeneration(t *testing.T, sizes ParameterSizes, L, N int) {
 }
 
 func TestParameterGeneration(t *testing.T) {
-	// This test is too slow to run all the time.
-	return
+	if testing.Short() {
+		t.Skip("skipping parameter generation test in short mode")
+	}
 
 	testParameterGeneration(t, L1024N160, 1024, 160)
 	testParameterGeneration(t, L2048N224, 2048, 224)
diff --git a/src/pkg/crypto/ecdsa/ecdsa.go b/src/pkg/crypto/ecdsa/ecdsa.go
index 512d20c63..255000229 100644
--- a/src/pkg/crypto/ecdsa/ecdsa.go
+++ b/src/pkg/crypto/ecdsa/ecdsa.go
@@ -49,7 +49,7 @@ func randFieldElement(c elliptic.Curve, rand io.Reader) (k *big.Int, err error)
 	return
 }
 
-// GenerateKey generates a public&private key pair.
+// GenerateKey generates a public and private key pair.
 func GenerateKey(c elliptic.Curve, rand io.Reader) (priv *PrivateKey, err error) {
 	k, err := randFieldElement(c, rand)
 	if err != nil {
diff --git a/src/pkg/crypto/md5/gen.go b/src/pkg/crypto/md5/gen.go
index 966bdae26..275b4aeea 100644
--- a/src/pkg/crypto/md5/gen.go
+++ b/src/pkg/crypto/md5/gen.go
@@ -161,6 +161,11 @@ var data = Data{
 }
 
 var program = `
+// DO NOT EDIT.
+// Generate with: go run gen.go{{if .Full}} -full{{end}} | gofmt >md5block.go
+
+// +build !amd64
+
 package md5
 
 import (
@@ -186,6 +191,16 @@ import (
 	}
 {{end}}
 
+const x86 = runtime.GOARCH == "amd64" || runtime.GOARCH == "386"
+
+var littleEndian bool
+
+func init() {
+	x := uint32(0x04030201)
+	y := [4]byte{0x1, 0x2, 0x3, 0x4}
+	littleEndian = *(*[4]byte)(unsafe.Pointer(&x)) == y
+}
+
 func block(dig *digest, p []byte) {
 	a := dig.s[0]
 	b := dig.s[1]
@@ -197,13 +212,13 @@ func block(dig *digest, p []byte) {
 		aa, bb, cc, dd := a, b, c, d
 
 		// This is a constant condition - it is not evaluated on each iteration.
-		if runtime.GOARCH == "amd64" || runtime.GOARCH == "386" {
+		if x86 {
 			// MD5 was designed so that x86 processors can just iterate
 			// over the block data directly as uint32s, and we generate
 			// less code and run 1.3x faster if we take advantage of that.
 			// My apologies.
 			X = (*[16]uint32)(unsafe.Pointer(&p[0]))
-		} else if uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
+		} else if littleEndian && uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
 			X = (*[16]uint32)(unsafe.Pointer(&p[0]))
 		} else {
 			X = &xbuf
diff --git a/src/pkg/crypto/md5/md5block.go b/src/pkg/crypto/md5/md5block.go
index 0ca421774..a376fbee9 100644
--- a/src/pkg/crypto/md5/md5block.go
+++ b/src/pkg/crypto/md5/md5block.go
@@ -1,3 +1,8 @@
+// DO NOT EDIT.
+// Generate with: go run gen.go -full | gofmt >md5block.go
+
+// +build !amd64,!386
+
 package md5
 
 import (
diff --git a/src/pkg/crypto/md5/md5block_386.s b/src/pkg/crypto/md5/md5block_386.s
new file mode 100644
index 000000000..3ce15e37f
--- /dev/null
+++ b/src/pkg/crypto/md5/md5block_386.s
@@ -0,0 +1,180 @@
+// Original source:
+//	http://www.zorinaq.com/papers/md5-amd64.html
+//	http://www.zorinaq.com/papers/md5-amd64.tar.bz2
+//
+// Translated from Perl generating GNU assembly into
+// #defines generating 8a assembly, and adjusted for 386,
+// by the Go Authors.
+
+// MD5 optimized for AMD64.
+//
+// Author: Marc Bevand <bevand_m (at) epita.fr>
+// Licence: I hereby disclaim the copyright on this code and place it
+// in the public domain.
+
+#define ROUND1(a, b, c, d, index, const, shift) \
+	XORL	c, BP; \
+	LEAL	const(a)(DI*1), a; \
+	ANDL	b, BP; \
+	XORL d, BP; \
+	MOVL (index*4)(SI), DI; \
+	ADDL BP, a; \
+	ROLL $shift, a; \
+	MOVL c, BP; \
+	ADDL b, a
+
+#define ROUND2(a, b, c, d, index, const, shift) \
+	LEAL	const(a)(DI*1),a; \
+	MOVL	d,		DI; \
+	ANDL	b,		DI; \
+	MOVL	d,		BP; \
+	NOTL	BP; \
+	ANDL	c,		BP; \
+	ORL	DI,		BP; \
+	MOVL	(index*4)(SI),DI; \
+	ADDL	BP,		a; \
+	ROLL	$shift,	a; \
+	ADDL	b,		a
+
+#define ROUND3(a, b, c, d, index, const, shift) \
+	LEAL	const(a)(DI*1),a; \
+	MOVL	(index*4)(SI),DI; \
+	XORL	d,		BP; \
+	XORL	b,		BP; \
+	ADDL	BP,		a; \
+	ROLL	$shift,		a; \
+	MOVL	b,		BP; \
+	ADDL	b,		a
+
+#define ROUND4(a, b, c, d, index, const, shift) \
+	LEAL	const(a)(DI*1),a; \
+	ORL	b,		BP; \
+	XORL	c,		BP; \
+	ADDL	BP,		a; \
+	MOVL	(index*4)(SI),DI; \
+	MOVL	$0xffffffff,	BP; \
+	ROLL	$shift,		a; \
+	XORL	c,		BP; \
+	ADDL	b,		a
+
+TEXT	·block(SB),7,$24-16
+	MOVL	dig+0(FP),	BP
+	MOVL	p+4(FP),	SI
+	MOVL	p_len+8(FP), DX
+	SHRL	$6,		DX
+	SHLL	$6,		DX
+
+	LEAL	(SI)(DX*1),	DI
+	MOVL	(0*4)(BP),	AX
+	MOVL	(1*4)(BP),	BX
+	MOVL	(2*4)(BP),	CX
+	MOVL	(3*4)(BP),	DX
+
+	CMPL	SI,		DI
+	JEQ	end
+
+	MOVL	DI,		16(SP)
+
+loop:
+	MOVL	AX,		0(SP)
+	MOVL	BX,		4(SP)
+	MOVL	CX,		8(SP)
+	MOVL	DX,		12(SP)
+
+	MOVL	(0*4)(SI),	DI
+	MOVL	DX,		BP
+
+	ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
+	ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
+	ROUND1(CX,DX,AX,BX, 3,0x242070db,17);
+	ROUND1(BX,CX,DX,AX, 4,0xc1bdceee,22);
+	ROUND1(AX,BX,CX,DX, 5,0xf57c0faf, 7);
+	ROUND1(DX,AX,BX,CX, 6,0x4787c62a,12);
+	ROUND1(CX,DX,AX,BX, 7,0xa8304613,17);
+	ROUND1(BX,CX,DX,AX, 8,0xfd469501,22);
+	ROUND1(AX,BX,CX,DX, 9,0x698098d8, 7);
+	ROUND1(DX,AX,BX,CX,10,0x8b44f7af,12);
+	ROUND1(CX,DX,AX,BX,11,0xffff5bb1,17);
+	ROUND1(BX,CX,DX,AX,12,0x895cd7be,22);
+	ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
+	ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
+	ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
+	ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
+
+	MOVL	(1*4)(SI),	DI
+	MOVL	DX,		BP
+
+	ROUND2(AX,BX,CX,DX, 6,0xf61e2562, 5);
+	ROUND2(DX,AX,BX,CX,11,0xc040b340, 9);
+	ROUND2(CX,DX,AX,BX, 0,0x265e5a51,14);
+	ROUND2(BX,CX,DX,AX, 5,0xe9b6c7aa,20);
+	ROUND2(AX,BX,CX,DX,10,0xd62f105d, 5);
+	ROUND2(DX,AX,BX,CX,15, 0x2441453, 9);
+	ROUND2(CX,DX,AX,BX, 4,0xd8a1e681,14);
+	ROUND2(BX,CX,DX,AX, 9,0xe7d3fbc8,20);
+	ROUND2(AX,BX,CX,DX,14,0x21e1cde6, 5);
+	ROUND2(DX,AX,BX,CX, 3,0xc33707d6, 9);
+	ROUND2(CX,DX,AX,BX, 8,0xf4d50d87,14);
+	ROUND2(BX,CX,DX,AX,13,0x455a14ed,20);
+	ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
+	ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
+	ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
+	ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
+ 
+	MOVL	(5*4)(SI),	DI
+	MOVL	CX,		BP
+
+	ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
+	ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
+	ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
+	ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
+	ROUND3(AX,BX,CX,DX, 4,0xa4beea44, 4);
+	ROUND3(DX,AX,BX,CX, 7,0x4bdecfa9,11);
+	ROUND3(CX,DX,AX,BX,10,0xf6bb4b60,16);
+	ROUND3(BX,CX,DX,AX,13,0xbebfbc70,23);
+	ROUND3(AX,BX,CX,DX, 0,0x289b7ec6, 4);
+	ROUND3(DX,AX,BX,CX, 3,0xeaa127fa,11);
+	ROUND3(CX,DX,AX,BX, 6,0xd4ef3085,16);
+	ROUND3(BX,CX,DX,AX, 9, 0x4881d05,23);
+	ROUND3(AX,BX,CX,DX,12,0xd9d4d039, 4);
+	ROUND3(DX,AX,BX,CX,15,0xe6db99e5,11);
+	ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
+	ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
+
+	MOVL	(0*4)(SI),	DI
+	MOVL	$0xffffffff,	BP
+	XORL	DX,		BP
+
+	ROUND4(AX,BX,CX,DX, 7,0xf4292244, 6);
+	ROUND4(DX,AX,BX,CX,14,0x432aff97,10);
+	ROUND4(CX,DX,AX,BX, 5,0xab9423a7,15);
+	ROUND4(BX,CX,DX,AX,12,0xfc93a039,21);
+	ROUND4(AX,BX,CX,DX, 3,0x655b59c3, 6);
+	ROUND4(DX,AX,BX,CX,10,0x8f0ccc92,10);
+	ROUND4(CX,DX,AX,BX, 1,0xffeff47d,15);
+	ROUND4(BX,CX,DX,AX, 8,0x85845dd1,21);
+	ROUND4(AX,BX,CX,DX,15,0x6fa87e4f, 6);
+	ROUND4(DX,AX,BX,CX, 6,0xfe2ce6e0,10);
+	ROUND4(CX,DX,AX,BX,13,0xa3014314,15);
+	ROUND4(BX,CX,DX,AX, 4,0x4e0811a1,21);
+	ROUND4(AX,BX,CX,DX,11,0xf7537e82, 6);
+	ROUND4(DX,AX,BX,CX, 2,0xbd3af235,10);
+	ROUND4(CX,DX,AX,BX, 9,0x2ad7d2bb,15);
+	ROUND4(BX,CX,DX,AX, 0,0xeb86d391,21);
+
+	ADDL	0(SP),	AX
+	ADDL	4(SP),	BX
+	ADDL	8(SP),	CX
+	ADDL	12(SP),	DX
+
+	ADDL	$64,		SI
+	CMPL	SI,		16(SP)
+	JB	loop
+
+end:
+	MOVL	dig+0(FP),	BP
+	MOVL	AX,		(0*4)(BP)
+	MOVL	BX,		(1*4)(BP)
+	MOVL	CX,		(2*4)(BP)
+	MOVL	DX,		(3*4)(BP)
+	RET
diff --git a/src/pkg/crypto/md5/md5block_amd64.s b/src/pkg/crypto/md5/md5block_amd64.s
new file mode 100644
index 000000000..e6420a28a
--- /dev/null
+++ b/src/pkg/crypto/md5/md5block_amd64.s
@@ -0,0 +1,177 @@
+// Original source:
+//	http://www.zorinaq.com/papers/md5-amd64.html
+//	http://www.zorinaq.com/papers/md5-amd64.tar.bz2
+//
+// Translated from Perl generating GNU assembly into
+// #defines generating 6a assembly by the Go Authors.
+
+// MD5 optimized for AMD64.
+//
+// Author: Marc Bevand <bevand_m (at) epita.fr>
+// Licence: I hereby disclaim the copyright on this code and place it
+// in the public domain.
+
+TEXT	·block(SB),7,$0-32
+	MOVQ	dig+0(FP),	BP
+	MOVQ	p+8(FP),	SI
+	MOVQ	p_len+16(FP), DX
+	SHRQ	$6,		DX
+	SHLQ	$6,		DX
+
+	LEAQ	(SI)(DX*1),	DI
+	MOVL	(0*4)(BP),	AX
+	MOVL	(1*4)(BP),	BX
+	MOVL	(2*4)(BP),	CX
+	MOVL	(3*4)(BP),	DX
+
+	CMPQ	SI,		DI
+	JEQ	end
+
+loop:
+	MOVL	AX,		R12
+	MOVL	BX,		R13
+	MOVL	CX,		R14
+	MOVL	DX,		R15
+
+	MOVL	(0*4)(SI),	R8
+	MOVL	DX,		R9
+
+#define ROUND1(a, b, c, d, index, const, shift) \
+	XORL	c, R9; \
+	LEAL	const(a)(R8*1), a; \
+	ANDL	b, R9; \
+	XORL d, R9; \
+	MOVL (index*4)(SI), R8; \
+	ADDL R9, a; \
+	ROLL $shift, a; \
+	MOVL c, R9; \
+	ADDL b, a
+
+	ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
+	ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
+	ROUND1(CX,DX,AX,BX, 3,0x242070db,17);
+	ROUND1(BX,CX,DX,AX, 4,0xc1bdceee,22);
+	ROUND1(AX,BX,CX,DX, 5,0xf57c0faf, 7);
+	ROUND1(DX,AX,BX,CX, 6,0x4787c62a,12);
+	ROUND1(CX,DX,AX,BX, 7,0xa8304613,17);
+	ROUND1(BX,CX,DX,AX, 8,0xfd469501,22);
+	ROUND1(AX,BX,CX,DX, 9,0x698098d8, 7);
+	ROUND1(DX,AX,BX,CX,10,0x8b44f7af,12);
+	ROUND1(CX,DX,AX,BX,11,0xffff5bb1,17);
+	ROUND1(BX,CX,DX,AX,12,0x895cd7be,22);
+	ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
+	ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
+	ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
+	ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
+
+	MOVL	(1*4)(SI),	R8
+	MOVL	DX,		R9
+	MOVL	DX,		R10
+
+#define ROUND2(a, b, c, d, index, const, shift) \
+	NOTL	R9; \
+	LEAL	const(a)(R8*1),a; \
+	ANDL	b,		R10; \
+	ANDL	c,		R9; \
+	MOVL	(index*4)(SI),R8; \
+	ORL	R9,		R10; \
+	MOVL	c,		R9; \
+	ADDL	R10,		a; \
+	MOVL	c,		R10; \
+	ROLL	$shift,	a; \
+	ADDL	b,		a
+
+	ROUND2(AX,BX,CX,DX, 6,0xf61e2562, 5);
+	ROUND2(DX,AX,BX,CX,11,0xc040b340, 9);
+	ROUND2(CX,DX,AX,BX, 0,0x265e5a51,14);
+	ROUND2(BX,CX,DX,AX, 5,0xe9b6c7aa,20);
+	ROUND2(AX,BX,CX,DX,10,0xd62f105d, 5);
+	ROUND2(DX,AX,BX,CX,15, 0x2441453, 9);
+	ROUND2(CX,DX,AX,BX, 4,0xd8a1e681,14);
+	ROUND2(BX,CX,DX,AX, 9,0xe7d3fbc8,20);
+	ROUND2(AX,BX,CX,DX,14,0x21e1cde6, 5);
+	ROUND2(DX,AX,BX,CX, 3,0xc33707d6, 9);
+	ROUND2(CX,DX,AX,BX, 8,0xf4d50d87,14);
+	ROUND2(BX,CX,DX,AX,13,0x455a14ed,20);
+	ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
+	ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
+	ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
+	ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
+ 
+	MOVL	(5*4)(SI),	R8
+	MOVL	CX,		R9
+
+#define ROUND3(a, b, c, d, index, const, shift) \
+	LEAL	const(a)(R8*1),a; \
+	MOVL	(index*4)(SI),R8; \
+	XORL	d,		R9; \
+	XORL	b,		R9; \
+	ADDL	R9,		a; \
+	ROLL	$shift,		a; \
+	MOVL	b,		R9; \
+	ADDL	b,		a
+
+	ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
+	ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
+	ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
+	ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
+	ROUND3(AX,BX,CX,DX, 4,0xa4beea44, 4);
+	ROUND3(DX,AX,BX,CX, 7,0x4bdecfa9,11);
+	ROUND3(CX,DX,AX,BX,10,0xf6bb4b60,16);
+	ROUND3(BX,CX,DX,AX,13,0xbebfbc70,23);
+	ROUND3(AX,BX,CX,DX, 0,0x289b7ec6, 4);
+	ROUND3(DX,AX,BX,CX, 3,0xeaa127fa,11);
+	ROUND3(CX,DX,AX,BX, 6,0xd4ef3085,16);
+	ROUND3(BX,CX,DX,AX, 9, 0x4881d05,23);
+	ROUND3(AX,BX,CX,DX,12,0xd9d4d039, 4);
+	ROUND3(DX,AX,BX,CX,15,0xe6db99e5,11);
+	ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
+	ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
+
+	MOVL	(0*4)(SI),	R8
+	MOVL	$0xffffffff,	R9
+	XORL	DX,		R9
+
+#define ROUND4(a, b, c, d, index, const, shift) \
+	LEAL	const(a)(R8*1),a; \
+	ORL	b,		R9; \
+	XORL	c,		R9; \
+	ADDL	R9,		a; \
+	MOVL	(index*4)(SI),R8; \
+	MOVL	$0xffffffff,	R9; \
+	ROLL	$shift,		a; \
+	XORL	c,		R9; \
+	ADDL	b,		a
+	
+	ROUND4(AX,BX,CX,DX, 7,0xf4292244, 6);
+	ROUND4(DX,AX,BX,CX,14,0x432aff97,10);
+	ROUND4(CX,DX,AX,BX, 5,0xab9423a7,15);
+	ROUND4(BX,CX,DX,AX,12,0xfc93a039,21);
+	ROUND4(AX,BX,CX,DX, 3,0x655b59c3, 6);
+	ROUND4(DX,AX,BX,CX,10,0x8f0ccc92,10);
+	ROUND4(CX,DX,AX,BX, 1,0xffeff47d,15);
+	ROUND4(BX,CX,DX,AX, 8,0x85845dd1,21);
+	ROUND4(AX,BX,CX,DX,15,0x6fa87e4f, 6);
+	ROUND4(DX,AX,BX,CX, 6,0xfe2ce6e0,10);
+	ROUND4(CX,DX,AX,BX,13,0xa3014314,15);
+	ROUND4(BX,CX,DX,AX, 4,0x4e0811a1,21);
+	ROUND4(AX,BX,CX,DX,11,0xf7537e82, 6);
+	ROUND4(DX,AX,BX,CX, 2,0xbd3af235,10);
+	ROUND4(CX,DX,AX,BX, 9,0x2ad7d2bb,15);
+	ROUND4(BX,CX,DX,AX, 0,0xeb86d391,21);
+
+	ADDL	R12,	AX
+	ADDL	R13,	BX
+	ADDL	R14,	CX
+	ADDL	R15,	DX
+
+	ADDQ	$64,		SI
+	CMPQ	SI,		DI
+	JB	loop
+
+end:
+	MOVL	AX,		(0*4)(BP)
+	MOVL	BX,		(1*4)(BP)
+	MOVL	CX,		(2*4)(BP)
+	MOVL	DX,		(3*4)(BP)
+	RET
diff --git a/src/pkg/crypto/md5/md5block_decl.go b/src/pkg/crypto/md5/md5block_decl.go
new file mode 100644
index 000000000..14190c6ff
--- /dev/null
+++ b/src/pkg/crypto/md5/md5block_decl.go
@@ -0,0 +1,9 @@
+// Copyright 2013 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 386
+
+package md5
+
+func block(dig *digest, p []byte)
diff --git a/src/pkg/crypto/rand/util.go b/src/pkg/crypto/rand/util.go
index 50e5b162b..0cd5e0e02 100644
--- a/src/pkg/crypto/rand/util.go
+++ b/src/pkg/crypto/rand/util.go
@@ -98,12 +98,13 @@ func Prime(rand io.Reader, bits int) (p *big.Int, err error) {
 			return
 		}
 	}
-
-	return
 }
 
-// Int returns a uniform random value in [0, max).
+// Int returns a uniform random value in [0, max). It panics if max <= 0.
 func Int(rand io.Reader, max *big.Int) (n *big.Int, err error) {
+	if max.Sign() <= 0 {
+		panic("crypto/rand: argument to Int is <= 0")
+	}
 	k := (max.BitLen() + 7) / 8
 
 	// b is the number of bits in the most significant byte of max.
@@ -130,6 +131,4 @@ func Int(rand io.Reader, max *big.Int) (n *big.Int, err error) {
 			return
 		}
 	}
-
-	return
 }
diff --git a/src/pkg/crypto/rc4/rc4.go b/src/pkg/crypto/rc4/rc4.go
index e0c33fa4b..3d717c63b 100644
--- a/src/pkg/crypto/rc4/rc4.go
+++ b/src/pkg/crypto/rc4/rc4.go
@@ -13,7 +13,7 @@ import "strconv"
 
 // A Cipher is an instance of RC4 using a particular key.
 type Cipher struct {
-	s    [256]byte
+	s    [256]uint32
 	i, j uint8
 }
 
@@ -32,11 +32,11 @@ func NewCipher(key []byte) (*Cipher, error) {
 	}
 	var c Cipher
 	for i := 0; i < 256; i++ {
-		c.s[i] = uint8(i)
+		c.s[i] = uint32(i)
 	}
 	var j uint8 = 0
 	for i := 0; i < 256; i++ {
-		j += c.s[i] + key[i%k]
+		j += uint8(c.s[i]) + key[i%k]
 		c.s[i], c.s[j] = c.s[j], c.s[i]
 	}
 	return &c, nil
diff --git a/src/pkg/crypto/rc4/rc4_386.s b/src/pkg/crypto/rc4/rc4_386.s
new file mode 100644
index 000000000..c80ef2a3a
--- /dev/null
+++ b/src/pkg/crypto/rc4/rc4_386.s
@@ -0,0 +1,51 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
+TEXT ·xorKeyStream(SB),7,$0
+	MOVL dst+0(FP), DI
+	MOVL src+4(FP), SI
+	MOVL state+12(FP), BP
+
+	MOVL i+16(FP), AX
+	MOVBLZX (AX), AX
+	MOVL j+20(FP), BX
+	MOVBLZX (BX), BX
+	CMPL n+8(FP), $0
+	JEQ done
+
+loop:
+	// i += 1
+	INCB AX
+
+	// j += c.s[i]
+	MOVBLZX (BP)(AX*4), DX
+	ADDB DX, BX
+	MOVBLZX BX, BX
+
+	// c.s[i], c.s[j] = c.s[j], c.s[i]
+	MOVBLZX (BP)(BX*4), CX
+	MOVB CX, (BP)(AX*4)
+	MOVB DX, (BP)(BX*4)
+
+	// *dst = *src ^ c.s[c.s[i]+c.s[j]]
+	ADDB DX, CX
+	MOVBLZX CX, CX
+	MOVB (BP)(CX*4), CX
+	XORB (SI), CX
+	MOVBLZX CX, CX
+	MOVB CX, (DI)
+
+	INCL SI
+	INCL DI
+	DECL n+8(FP)
+	JNE loop
+
+done:
+	MOVL i+16(FP), CX
+	MOVB AX, (CX)
+	MOVL j+20(FP), CX
+	MOVB BX, (CX)
+
+	RET
diff --git a/src/pkg/crypto/rc4/rc4_amd64.s b/src/pkg/crypto/rc4/rc4_amd64.s
index ffe9ada85..353fe3720 100644
--- a/src/pkg/crypto/rc4/rc4_amd64.s
+++ b/src/pkg/crypto/rc4/rc4_amd64.s
@@ -1,53 +1,177 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Original source:
+//	http://www.zorinaq.com/papers/rc4-amd64.html
+//	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
+
+// Local modifications:
+//
+// Transliterated from GNU to 6a assembly syntax by the Go authors.
+// The comments and spacing are from the original.
+//
+// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
+//
+// The original code accumulated 64 bits of key stream in an integer
+// register and then XOR'ed the key stream into the data 8 bytes at a time.
+// Modified to accumulate 128 bits of key stream into an XMM register
+// and then XOR the key stream into the data 16 bytes at a time.
+// Approximately doubles throughput.
+
+// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
+// but makes the code run 2.0x slower on Xeon.
+#define EXTEND(r) MOVBLZX r, r
+
+/*
+** RC4 implementation optimized for AMD64.
+**
+** Author: Marc Bevand <bevand_m (at) epita.fr>
+** Licence: I hereby disclaim the copyright on this code and place it
+** in the public domain.
+**
+** The code has been designed to be easily integrated into openssl:
+** the exported RC4() function can replace the actual implementations
+** openssl already contains. Please note that when linking with openssl,
+** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
+** with -DRC4_INT='unsigned long'.
+**
+** The throughput achieved by this code is about 320 MBytes/sec, on
+** a 1.8 GHz AMD Opteron (rev C0) processor.
+*/
 
-// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
 TEXT ·xorKeyStream(SB),7,$0
-	MOVQ dst+0(FP), DI
-	MOVQ src+8(FP), SI
-	MOVQ n+16(FP), CX
-	MOVQ state+24(FP), R8
+	MOVQ	n+16(FP),	BX		// rbx = ARG(len)
+	MOVQ	src+8(FP),	SI		// in = ARG(in)
+	MOVQ	dst+0(FP),	DI		// out = ARG(out)
+	MOVQ	state+24(FP),	BP		// d = ARG(data)
+	MOVQ	i+32(FP),	AX
+	MOVBQZX	0(AX),		CX		// x = *xp
+	MOVQ	j+40(FP),	AX
+	MOVBQZX	0(AX),		DX		// y = *yp
+
+	LEAQ	(SI)(BX*1),	R9		// limit = in+len
+
+l1:	CMPQ	SI,		R9		// cmp in with in+len
+	JGE	finished			// jump if (in >= in+len)
+
+	INCB	CX
+	EXTEND(CX)
+	TESTL	$15,		CX
+	JZ	wordloop
+
+	MOVBLZX	(BP)(CX*4),	AX
+
+	ADDB	AX,		DX		// y += tx
+	EXTEND(DX)
+	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
+	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
+	ADDB	AX,		BX		// val = ty+tx
+	EXTEND(BX)
+	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
+	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
+	XORB	(SI),		R8		// xor 1 byte
+	MOVB	R8,		(DI)
+	INCQ	SI				// in++
+	INCQ	DI				// out++
+	JMP l1
+
+wordloop:
+	SUBQ	$16,		R9
+	CMPQ	SI,		R9
+	JGT	end
+
+start:
+	ADDQ	$16,		SI		// increment in
+	ADDQ	$16,		DI		// increment out
+
+	// Each KEYROUND generates one byte of key and
+	// inserts it into an XMM register at the given 16-bit index.
+	// The key state array is uint32 words only using the bottom
+	// byte of each word, so the 16-bit OR only copies 8 useful bits.
+	// We accumulate alternating bytes into X0 and X1, and then at
+	// the end we OR X1<<8 into X0 to produce the actual key.
+	//
+	// At the beginning of the loop, CX%16 == 0, so the 16 loads
+	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
+	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
+	// without fear of the byte computation CX+15 wrapping around.
+	//
+	// The first round needs R12[0], the second needs R12[1], and so on.
+	// We can avoid memory stalls by starting the load for round n+1
+	// before the end of round n, using the LOAD macro.
+	LEAQ	(BP)(CX*4),	R12
 
-	MOVQ xPtr+32(FP), AX
-	MOVBQZX (AX), AX
-	MOVQ yPtr+40(FP), BX
-	MOVBQZX (BX), BX
+#define KEYROUND(xmm, load, off, r1, r2, index) \
+	MOVBLZX	(BP)(DX*4),	R8; \
+	MOVB	r1,		(BP)(DX*4); \
+	load((off+1), r2); \
+	MOVB	R8,		(off*4)(R12); \
+	ADDB	r1,		R8; \
+	EXTEND(R8); \
+	PINSRW	$index, (BP)(R8*4), xmm
 
-loop:
-	CMPQ CX, $0
-	JE done
+#define LOAD(off, reg) \
+	MOVBLZX	(off*4)(R12),	reg; \
+	ADDB	reg,		DX; \
+	EXTEND(DX)
 
-	// c.i += 1
-	INCB AX
+#define SKIP(off, reg)
 
-	// c.j += c.s[c.i]
-	MOVB (R8)(AX*1), R9
-	ADDB R9, BX
+	LOAD(0, AX)
+	KEYROUND(X0, LOAD, 0, AX, BX, 0)
+	KEYROUND(X1, LOAD, 1, BX, AX, 0)
+	KEYROUND(X0, LOAD, 2, AX, BX, 1)
+	KEYROUND(X1, LOAD, 3, BX, AX, 1)
+	KEYROUND(X0, LOAD, 4, AX, BX, 2)
+	KEYROUND(X1, LOAD, 5, BX, AX, 2)
+	KEYROUND(X0, LOAD, 6, AX, BX, 3)
+	KEYROUND(X1, LOAD, 7, BX, AX, 3)
+	KEYROUND(X0, LOAD, 8, AX, BX, 4)
+	KEYROUND(X1, LOAD, 9, BX, AX, 4)
+	KEYROUND(X0, LOAD, 10, AX, BX, 5)
+	KEYROUND(X1, LOAD, 11, BX, AX, 5)
+	KEYROUND(X0, LOAD, 12, AX, BX, 6)
+	KEYROUND(X1, LOAD, 13, BX, AX, 6)
+	KEYROUND(X0, LOAD, 14, AX, BX, 7)
+	KEYROUND(X1, SKIP, 15, BX, AX, 7)
+	
+	ADDB	$16,		CX
 
-	MOVBQZX (R8)(BX*1), R10
+	PSLLQ	$8,		X1
+	PXOR	X1,		X0
+	MOVOU	-16(SI),	X2
+	PXOR	X0,		X2
+	MOVOU	X2,		-16(DI)
 
-	MOVB R10, (R8)(AX*1)
-	MOVB R9, (R8)(BX*1)
+	CMPQ	SI,		R9		// cmp in with in+len-16
+	JLE	start				// jump if (in <= in+len-16)
 
-	// R11 = c.s[c.i]+c.s[c.j]
-	MOVQ R10, R11
-	ADDB R9, R11
+end:
+	DECB	CX
+	ADDQ	$16,		R9		// tmp = in+len
 
-	MOVB (R8)(R11*1), R11
-	MOVB (SI), R12
-	XORB R11, R12
-	MOVB R12, (DI)
+	// handle the last bytes, one by one
+l2:	CMPQ	SI,		R9		// cmp in with in+len
+	JGE	finished			// jump if (in >= in+len)
 
-	INCQ SI
-	INCQ DI
-	DECQ CX
+	INCB	CX
+	EXTEND(CX)
+	MOVBLZX	(BP)(CX*4),	AX
 
-	JMP loop
-done:
-	MOVQ xPtr+32(FP), R8
-	MOVB AX, (R8)
-	MOVQ yPtr+40(FP), R8
-	MOVB BX, (R8)
+	ADDB	AX,		DX		// y += tx
+	EXTEND(DX)
+	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
+	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
+	ADDB	AX,		BX		// val = ty+tx
+	EXTEND(BX)
+	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
+	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
+	XORB	(SI),		R8		// xor 1 byte
+	MOVB	R8,		(DI)
+	INCQ	SI				// in++
+	INCQ	DI				// out++
+	JMP l2
 
+finished:
+	MOVQ	j+40(FP),	BX
+	MOVB	DX, 0(BX)
+	MOVQ	i+32(FP),	AX
+	MOVB	CX, 0(AX)
 	RET
diff --git a/src/pkg/crypto/rc4/rc4_arm.s b/src/pkg/crypto/rc4/rc4_arm.s
index 51a332f62..307cb7148 100644
--- a/src/pkg/crypto/rc4/rc4_arm.s
+++ b/src/pkg/crypto/rc4/rc4_arm.s
@@ -31,19 +31,19 @@ loop:
 	// i += 1; j += state[i]
 	ADD $1, R(i)
 	AND $0xff, R(i)
-	MOVBU R(i)<<0(R(state)), R(t)
+	MOVBU R(i)<<2(R(state)), R(t)
 	ADD R(t), R(j)
 	AND $0xff, R(j)
 
 	// swap state[i] <-> state[j]
-	MOVBU R(j)<<0(R(state)), R(t2)
-	MOVB R(t2), R(i)<<0(R(state))
-	MOVB R(t), R(j)<<0(R(state))
+	MOVBU R(j)<<2(R(state)), R(t2)
+	MOVB R(t2), R(i)<<2(R(state))
+	MOVB R(t), R(j)<<2(R(state))
 
 	// dst[k] = src[k] ^ state[state[i] + state[j]]
 	ADD R(t2), R(t)
 	AND $0xff, R(t)
-	MOVBU R(t)<<0(R(state)), R(t)
+	MOVBU R(t)<<2(R(state)), R(t)
 	MOVBU R(k)<<0(R(src)), R(t2)
 	EOR R(t), R(t2)
 	MOVB R(t2), R(k)<<0(R(dst))
diff --git a/src/pkg/crypto/rc4/rc4_asm.go b/src/pkg/crypto/rc4/rc4_asm.go
index 0b66e4a9e..c582a4488 100644
--- a/src/pkg/crypto/rc4/rc4_asm.go
+++ b/src/pkg/crypto/rc4/rc4_asm.go
@@ -2,11 +2,11 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build amd64 arm
+// +build amd64 arm 386
 
 package rc4
 
-func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
+func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8)
 
 // XORKeyStream sets dst to the result of XORing src with the key stream.
 // Dst and src may be the same slice but otherwise should not overlap.
diff --git a/src/pkg/crypto/rc4/rc4_ref.go b/src/pkg/crypto/rc4/rc4_ref.go
index 1018548c2..44d380436 100644
--- a/src/pkg/crypto/rc4/rc4_ref.go
+++ b/src/pkg/crypto/rc4/rc4_ref.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64,!arm
+// +build !amd64,!arm,!386
 
 package rc4
 
diff --git a/src/pkg/crypto/rc4/rc4_test.go b/src/pkg/crypto/rc4/rc4_test.go
index 9e12789f7..7b4df6791 100644
--- a/src/pkg/crypto/rc4/rc4_test.go
+++ b/src/pkg/crypto/rc4/rc4_test.go
@@ -5,6 +5,8 @@
 package rc4
 
 import (
+	"bytes"
+	"fmt"
 	"testing"
 )
 
@@ -72,25 +74,68 @@ var golden = []rc4Test{
 	},
 }
 
+func testEncrypt(t *testing.T, desc string, c *Cipher, src, expect []byte) {
+	dst := make([]byte, len(src))
+	c.XORKeyStream(dst, src)
+	for i, v := range dst {
+		if v != expect[i] {
+			t.Fatalf("%s: mismatch at byte %d:\nhave %x\nwant %x", desc, i, dst, expect)
+		}
+	}
+}
+
 func TestGolden(t *testing.T) {
-	for i := 0; i < len(golden); i++ {
-		g := golden[i]
-		c, err := NewCipher(g.key)
-		if err != nil {
-			t.Errorf("Failed to create cipher at golden index %d", i)
-			return
+	for gi, g := range golden {
+		data := make([]byte, len(g.keystream))
+		for i := range data {
+			data[i] = byte(i)
 		}
-		keystream := make([]byte, len(g.keystream))
-		c.XORKeyStream(keystream, keystream)
-		for j, v := range keystream {
-			if g.keystream[j] != v {
-				t.Errorf("Failed at golden index %d:\n%x\nvs\n%x", i, keystream, g.keystream)
-				break
+
+		expect := make([]byte, len(g.keystream))
+		for i := range expect {
+			expect[i] = byte(i) ^ g.keystream[i]
+		}
+
+		for size := 1; size <= len(g.keystream); size++ {
+			c, err := NewCipher(g.key)
+			if err != nil {
+				t.Fatalf("#%d: NewCipher: %v", gi, err)
+			}
+
+			off := 0
+			for off < len(g.keystream) {
+				n := len(g.keystream) - off
+				if n > size {
+					n = size
+				}
+				desc := fmt.Sprintf("#%d@[%d:%d]", gi, off, off+n)
+				testEncrypt(t, desc, c, data[off:off+n], expect[off:off+n])
+				off += n
 			}
 		}
 	}
 }
 
+func TestBlock(t *testing.T) {
+	c1a, _ := NewCipher(golden[0].key)
+	c1b, _ := NewCipher(golden[1].key)
+	data1 := make([]byte, 1<<20)
+	for i := range data1 {
+		c1a.XORKeyStream(data1[i:i+1], data1[i:i+1])
+		c1b.XORKeyStream(data1[i:i+1], data1[i:i+1])
+	}
+
+	c2a, _ := NewCipher(golden[0].key)
+	c2b, _ := NewCipher(golden[1].key)
+	data2 := make([]byte, 1<<20)
+	c2a.XORKeyStream(data2, data2)
+	c2b.XORKeyStream(data2, data2)
+
+	if !bytes.Equal(data1, data2) {
+		t.Fatalf("bad block")
+	}
+}
+
 func benchmark(b *testing.B, size int64) {
 	buf := make([]byte, size)
 	c, err := NewCipher(golden[0].key)
diff --git a/src/pkg/crypto/rsa/rsa.go b/src/pkg/crypto/rsa/rsa.go
index 35a5f7c3c..f56fb37ee 100644
--- a/src/pkg/crypto/rsa/rsa.go
+++ b/src/pkg/crypto/rsa/rsa.go
@@ -203,7 +203,9 @@ NextSetOfPrimes:
 		g.GCD(priv.D, y, e, totient)
 
 		if g.Cmp(bigOne) == 0 {
-			priv.D.Add(priv.D, totient)
+			if priv.D.Sign() < 0 {
+				priv.D.Add(priv.D, totient)
+			}
 			priv.Primes = primes
 			priv.N = n
 
diff --git a/src/pkg/crypto/rsa/rsa_test.go b/src/pkg/crypto/rsa/rsa_test.go
index f08cfe73c..ffd96e62f 100644
--- a/src/pkg/crypto/rsa/rsa_test.go
+++ b/src/pkg/crypto/rsa/rsa_test.go
@@ -93,6 +93,9 @@ func testKeyBasics(t *testing.T, priv *PrivateKey) {
 	if err := priv.Validate(); err != nil {
 		t.Errorf("Validate() failed: %s", err)
 	}
+	if priv.D.Cmp(priv.N) > 0 {
+		t.Errorf("private exponent too large")
+	}
 
 	pub := &priv.PublicKey
 	m := big.NewInt(42)
diff --git a/src/pkg/crypto/sha1/sha1block.go b/src/pkg/crypto/sha1/sha1block.go
index 1c9507c68..92224fc0e 100644
--- a/src/pkg/crypto/sha1/sha1block.go
+++ b/src/pkg/crypto/sha1/sha1block.go
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build !amd64,!386
+
 // SHA1 block step.
 // In its own file so that a faster assembly or C version
 // can be substituted easily.
diff --git a/src/pkg/crypto/sha1/sha1block_386.s b/src/pkg/crypto/sha1/sha1block_386.s
new file mode 100644
index 000000000..e60a7b9b0
--- /dev/null
+++ b/src/pkg/crypto/sha1/sha1block_386.s
@@ -0,0 +1,233 @@
+// Copyright 2013 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// SHA1 block routine. See sha1block.go for Go equivalent.
+//
+// There are 80 rounds of 4 types:
+//   - rounds 0-15 are type 1 and load data (ROUND1 macro).
+//   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
+//   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
+//   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
+//   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
+//
+// Each round loads or shuffles the data, then computes a per-round
+// function of b, c, d, and then mixes the result into and rotates the
+// five registers a, b, c, d, e holding the intermediate results.
+//
+// The register rotation is implemented by rotating the arguments to
+// the round macros instead of by explicit move instructions.
+
+// Like sha1block_amd64.s, but we keep the data and limit pointers on the stack.
+// To free up the word pointer (R10 on amd64, DI here), we add it to e during
+// LOAD/SHUFFLE instead of during MIX.
+//
+// The stack holds the intermediate word array - 16 uint32s - at 0(SP) up to 64(SP).
+// The saved a, b, c, d, e (R11 through R15 on amd64) are at 64(SP) up to 84(SP).
+// The saved limit pointer (DI on amd64) is at 84(SP).
+// The saved data pointer (SI on amd64) is at 88(SP).
+
+#define LOAD(index, e) \
+	MOVL	88(SP), SI; \
+	MOVL	(index*4)(SI), DI; \
+	BSWAPL	DI; \
+	MOVL	DI, (index*4)(SP); \
+	ADDL	DI, e
+
+#define SHUFFLE(index, e) \
+	MOVL	(((index)&0xf)*4)(SP), DI; \
+	XORL	(((index-3)&0xf)*4)(SP), DI; \
+	XORL	(((index-8)&0xf)*4)(SP), DI; \
+	XORL	(((index-14)&0xf)*4)(SP), DI; \
+	ROLL	$1, DI; \
+	MOVL	DI, (((index)&0xf)*4)(SP); \
+	ADDL	DI, e
+
+#define FUNC1(a, b, c, d, e) \
+	MOVL	b, SI; \
+	ANDL	c, SI; \
+	MOVL	b, DI; \
+	NOTL	DI; \
+	ANDL	d, DI; \
+	ORL	SI, DI
+
+#define FUNC2(a, b, c, d, e) \
+	MOVL	b, DI; \
+	XORL	c, DI; \
+	XORL	d, DI
+
+#define FUNC3(a, b, c, d, e) \
+	MOVL	b, SI; \
+	ORL	c, SI; \
+	ANDL	d, SI; \
+	MOVL	b, DI; \
+	ANDL	c, DI; \
+	ORL	SI, DI
+
+#define FUNC4 FUNC2
+
+#define MIX(a, b, c, d, e, const) \
+	ROLL	$30, b; \
+	ADDL	DI, e; \
+	MOVL	a, SI; \
+	ROLL	$5, SI; \
+	LEAL	const(e)(SI*1), e
+
+#define ROUND1(a, b, c, d, e, index) \
+	LOAD(index, e); \
+	FUNC1(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND1x(a, b, c, d, e, index) \
+	SHUFFLE(index, e); \
+	FUNC1(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND2(a, b, c, d, e, index) \
+	SHUFFLE(index, e); \
+	FUNC2(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x6ED9EBA1)
+
+#define ROUND3(a, b, c, d, e, index) \
+	SHUFFLE(index, e); \
+	FUNC3(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x8F1BBCDC)
+
+#define ROUND4(a, b, c, d, e, index) \
+	SHUFFLE(index, e); \
+	FUNC4(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0xCA62C1D6)
+
+// func block(dig *digest, p []byte)
+TEXT ·block(SB),7,$92-16
+	MOVL	dig+0(FP),	BP
+	MOVL	p+4(FP),	SI
+	MOVL	p_len+8(FP),	DX
+	SHRL	$6,		DX
+	SHLL	$6,		DX
+	
+	LEAL	(SI)(DX*1),	DI
+	MOVL	(0*4)(BP),	AX
+	MOVL	(1*4)(BP),	BX
+	MOVL	(2*4)(BP),	CX
+	MOVL	(3*4)(BP),	DX
+	MOVL	(4*4)(BP),	BP
+
+	CMPL	SI,		DI
+	JEQ	end
+
+	MOVL	DI,	84(SP)
+
+loop:
+	MOVL	SI,	88(SP)
+
+	MOVL	AX,	64(SP)
+	MOVL	BX,	68(SP)
+	MOVL	CX,	72(SP)
+	MOVL	DX,	76(SP)
+	MOVL	BP,	80(SP)
+
+	ROUND1(AX, BX, CX, DX, BP, 0)
+	ROUND1(BP, AX, BX, CX, DX, 1)
+	ROUND1(DX, BP, AX, BX, CX, 2)
+	ROUND1(CX, DX, BP, AX, BX, 3)
+	ROUND1(BX, CX, DX, BP, AX, 4)
+	ROUND1(AX, BX, CX, DX, BP, 5)
+	ROUND1(BP, AX, BX, CX, DX, 6)
+	ROUND1(DX, BP, AX, BX, CX, 7)
+	ROUND1(CX, DX, BP, AX, BX, 8)
+	ROUND1(BX, CX, DX, BP, AX, 9)
+	ROUND1(AX, BX, CX, DX, BP, 10)
+	ROUND1(BP, AX, BX, CX, DX, 11)
+	ROUND1(DX, BP, AX, BX, CX, 12)
+	ROUND1(CX, DX, BP, AX, BX, 13)
+	ROUND1(BX, CX, DX, BP, AX, 14)
+	ROUND1(AX, BX, CX, DX, BP, 15)
+
+	ROUND1x(BP, AX, BX, CX, DX, 16)
+	ROUND1x(DX, BP, AX, BX, CX, 17)
+	ROUND1x(CX, DX, BP, AX, BX, 18)
+	ROUND1x(BX, CX, DX, BP, AX, 19)
+	
+	ROUND2(AX, BX, CX, DX, BP, 20)
+	ROUND2(BP, AX, BX, CX, DX, 21)
+	ROUND2(DX, BP, AX, BX, CX, 22)
+	ROUND2(CX, DX, BP, AX, BX, 23)
+	ROUND2(BX, CX, DX, BP, AX, 24)
+	ROUND2(AX, BX, CX, DX, BP, 25)
+	ROUND2(BP, AX, BX, CX, DX, 26)
+	ROUND2(DX, BP, AX, BX, CX, 27)
+	ROUND2(CX, DX, BP, AX, BX, 28)
+	ROUND2(BX, CX, DX, BP, AX, 29)
+	ROUND2(AX, BX, CX, DX, BP, 30)
+	ROUND2(BP, AX, BX, CX, DX, 31)
+	ROUND2(DX, BP, AX, BX, CX, 32)
+	ROUND2(CX, DX, BP, AX, BX, 33)
+	ROUND2(BX, CX, DX, BP, AX, 34)
+	ROUND2(AX, BX, CX, DX, BP, 35)
+	ROUND2(BP, AX, BX, CX, DX, 36)
+	ROUND2(DX, BP, AX, BX, CX, 37)
+	ROUND2(CX, DX, BP, AX, BX, 38)
+	ROUND2(BX, CX, DX, BP, AX, 39)
+	
+	ROUND3(AX, BX, CX, DX, BP, 40)
+	ROUND3(BP, AX, BX, CX, DX, 41)
+	ROUND3(DX, BP, AX, BX, CX, 42)
+	ROUND3(CX, DX, BP, AX, BX, 43)
+	ROUND3(BX, CX, DX, BP, AX, 44)
+	ROUND3(AX, BX, CX, DX, BP, 45)
+	ROUND3(BP, AX, BX, CX, DX, 46)
+	ROUND3(DX, BP, AX, BX, CX, 47)
+	ROUND3(CX, DX, BP, AX, BX, 48)
+	ROUND3(BX, CX, DX, BP, AX, 49)
+	ROUND3(AX, BX, CX, DX, BP, 50)
+	ROUND3(BP, AX, BX, CX, DX, 51)
+	ROUND3(DX, BP, AX, BX, CX, 52)
+	ROUND3(CX, DX, BP, AX, BX, 53)
+	ROUND3(BX, CX, DX, BP, AX, 54)
+	ROUND3(AX, BX, CX, DX, BP, 55)
+	ROUND3(BP, AX, BX, CX, DX, 56)
+	ROUND3(DX, BP, AX, BX, CX, 57)
+	ROUND3(CX, DX, BP, AX, BX, 58)
+	ROUND3(BX, CX, DX, BP, AX, 59)
+	
+	ROUND4(AX, BX, CX, DX, BP, 60)
+	ROUND4(BP, AX, BX, CX, DX, 61)
+	ROUND4(DX, BP, AX, BX, CX, 62)
+	ROUND4(CX, DX, BP, AX, BX, 63)
+	ROUND4(BX, CX, DX, BP, AX, 64)
+	ROUND4(AX, BX, CX, DX, BP, 65)
+	ROUND4(BP, AX, BX, CX, DX, 66)
+	ROUND4(DX, BP, AX, BX, CX, 67)
+	ROUND4(CX, DX, BP, AX, BX, 68)
+	ROUND4(BX, CX, DX, BP, AX, 69)
+	ROUND4(AX, BX, CX, DX, BP, 70)
+	ROUND4(BP, AX, BX, CX, DX, 71)
+	ROUND4(DX, BP, AX, BX, CX, 72)
+	ROUND4(CX, DX, BP, AX, BX, 73)
+	ROUND4(BX, CX, DX, BP, AX, 74)
+	ROUND4(AX, BX, CX, DX, BP, 75)
+	ROUND4(BP, AX, BX, CX, DX, 76)
+	ROUND4(DX, BP, AX, BX, CX, 77)
+	ROUND4(CX, DX, BP, AX, BX, 78)
+	ROUND4(BX, CX, DX, BP, AX, 79)
+
+	ADDL	64(SP), AX
+	ADDL	68(SP), BX
+	ADDL	72(SP), CX
+	ADDL	76(SP), DX
+	ADDL	80(SP), BP
+
+	MOVL	88(SP), SI
+	ADDL	$64, SI
+	CMPL	SI, 84(SP)
+	JB	loop
+
+end:
+	MOVL	dig+0(FP), DI
+	MOVL	AX, (0*4)(DI)
+	MOVL	BX, (1*4)(DI)
+	MOVL	CX, (2*4)(DI)
+	MOVL	DX, (3*4)(DI)
+	MOVL	BP, (4*4)(DI)
+	RET
diff --git a/src/pkg/crypto/sha1/sha1block_amd64.s b/src/pkg/crypto/sha1/sha1block_amd64.s
new file mode 100644
index 000000000..452578aa4
--- /dev/null
+++ b/src/pkg/crypto/sha1/sha1block_amd64.s
@@ -0,0 +1,216 @@
+// Copyright 2013 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// SHA1 block routine. See sha1block.go for Go equivalent.
+//
+// There are 80 rounds of 4 types:
+//   - rounds 0-15 are type 1 and load data (ROUND1 macro).
+//   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
+//   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
+//   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
+//   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
+//
+// Each round loads or shuffles the data, then computes a per-round
+// function of b, c, d, and then mixes the result into and rotates the
+// five registers a, b, c, d, e holding the intermediate results.
+//
+// The register rotation is implemented by rotating the arguments to
+// the round macros instead of by explicit move instructions.
+
+#define LOAD(index) \
+	MOVL	(index*4)(SI), R10; \
+	BSWAPL	R10; \
+	MOVL	R10, (index*4)(SP)
+
+#define SHUFFLE(index) \
+	MOVL	(((index)&0xf)*4)(SP), R10; \
+	XORL	(((index-3)&0xf)*4)(SP), R10; \
+	XORL	(((index-8)&0xf)*4)(SP), R10; \
+	XORL	(((index-14)&0xf)*4)(SP), R10; \
+	ROLL	$1, R10; \
+	MOVL	R10, (((index)&0xf)*4)(SP)
+
+#define FUNC1(a, b, c, d, e) \
+	MOVL	b, R8; \
+	ANDL	c, R8; \
+	MOVL	b, R9; \
+	NOTL	R9; \
+	ANDL	d, R9; \
+	ORL	R8, R9
+
+#define FUNC2(a, b, c, d, e) \
+	MOVL	b, R9; \
+	XORL	c, R9; \
+	XORL	d, R9
+
+#define FUNC3(a, b, c, d, e) \
+	MOVL	b, R8; \
+	ORL	c, R8; \
+	ANDL	d, R8; \
+	MOVL	b, R9; \
+	ANDL	c, R9; \
+	ORL	R8, R9
+	
+#define FUNC4 FUNC2
+
+#define MIX(a, b, c, d, e, const) \
+	ROLL	$30, b; \
+	ADDL	R9, e; \
+	MOVL	a, R8; \
+	ROLL	$5, R8; \
+	LEAL	const(e)(R10*1), e; \
+	ADDL	R8, e
+
+#define ROUND1(a, b, c, d, e, index) \
+	LOAD(index); \
+	FUNC1(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND1x(a, b, c, d, e, index) \
+	SHUFFLE(index); \
+	FUNC1(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND2(a, b, c, d, e, index) \
+	SHUFFLE(index); \
+	FUNC2(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x6ED9EBA1)
+
+#define ROUND3(a, b, c, d, e, index) \
+	SHUFFLE(index); \
+	FUNC3(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x8F1BBCDC)
+
+#define ROUND4(a, b, c, d, e, index) \
+	SHUFFLE(index); \
+	FUNC4(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0xCA62C1D6)
+
+TEXT ·block(SB),7,$64-32
+	MOVQ	dig+0(FP),	BP
+	MOVQ	p_base+8(FP),	SI
+	MOVQ	p_len+16(FP),	DX
+	SHRQ	$6,		DX
+	SHLQ	$6,		DX
+	
+	LEAQ	(SI)(DX*1),	DI
+	MOVL	(0*4)(BP),	AX
+	MOVL	(1*4)(BP),	BX
+	MOVL	(2*4)(BP),	CX
+	MOVL	(3*4)(BP),	DX
+	MOVL	(4*4)(BP),	BP
+
+	CMPQ	SI,		DI
+	JEQ	end
+
+loop:
+	MOVL	AX,	R11
+	MOVL	BX,	R12
+	MOVL	CX,	R13
+	MOVL	DX,	R14
+	MOVL	BP,	R15
+
+	ROUND1(AX, BX, CX, DX, BP, 0)
+	ROUND1(BP, AX, BX, CX, DX, 1)
+	ROUND1(DX, BP, AX, BX, CX, 2)
+	ROUND1(CX, DX, BP, AX, BX, 3)
+	ROUND1(BX, CX, DX, BP, AX, 4)
+	ROUND1(AX, BX, CX, DX, BP, 5)
+	ROUND1(BP, AX, BX, CX, DX, 6)
+	ROUND1(DX, BP, AX, BX, CX, 7)
+	ROUND1(CX, DX, BP, AX, BX, 8)
+	ROUND1(BX, CX, DX, BP, AX, 9)
+	ROUND1(AX, BX, CX, DX, BP, 10)
+	ROUND1(BP, AX, BX, CX, DX, 11)
+	ROUND1(DX, BP, AX, BX, CX, 12)
+	ROUND1(CX, DX, BP, AX, BX, 13)
+	ROUND1(BX, CX, DX, BP, AX, 14)
+	ROUND1(AX, BX, CX, DX, BP, 15)
+
+	ROUND1x(BP, AX, BX, CX, DX, 16)
+	ROUND1x(DX, BP, AX, BX, CX, 17)
+	ROUND1x(CX, DX, BP, AX, BX, 18)
+	ROUND1x(BX, CX, DX, BP, AX, 19)
+	
+	ROUND2(AX, BX, CX, DX, BP, 20)
+	ROUND2(BP, AX, BX, CX, DX, 21)
+	ROUND2(DX, BP, AX, BX, CX, 22)
+	ROUND2(CX, DX, BP, AX, BX, 23)
+	ROUND2(BX, CX, DX, BP, AX, 24)
+	ROUND2(AX, BX, CX, DX, BP, 25)
+	ROUND2(BP, AX, BX, CX, DX, 26)
+	ROUND2(DX, BP, AX, BX, CX, 27)
+	ROUND2(CX, DX, BP, AX, BX, 28)
+	ROUND2(BX, CX, DX, BP, AX, 29)
+	ROUND2(AX, BX, CX, DX, BP, 30)
+	ROUND2(BP, AX, BX, CX, DX, 31)
+	ROUND2(DX, BP, AX, BX, CX, 32)
+	ROUND2(CX, DX, BP, AX, BX, 33)
+	ROUND2(BX, CX, DX, BP, AX, 34)
+	ROUND2(AX, BX, CX, DX, BP, 35)
+	ROUND2(BP, AX, BX, CX, DX, 36)
+	ROUND2(DX, BP, AX, BX, CX, 37)
+	ROUND2(CX, DX, BP, AX, BX, 38)
+	ROUND2(BX, CX, DX, BP, AX, 39)
+	
+	ROUND3(AX, BX, CX, DX, BP, 40)
+	ROUND3(BP, AX, BX, CX, DX, 41)
+	ROUND3(DX, BP, AX, BX, CX, 42)
+	ROUND3(CX, DX, BP, AX, BX, 43)
+	ROUND3(BX, CX, DX, BP, AX, 44)
+	ROUND3(AX, BX, CX, DX, BP, 45)
+	ROUND3(BP, AX, BX, CX, DX, 46)
+	ROUND3(DX, BP, AX, BX, CX, 47)
+	ROUND3(CX, DX, BP, AX, BX, 48)
+	ROUND3(BX, CX, DX, BP, AX, 49)
+	ROUND3(AX, BX, CX, DX, BP, 50)
+	ROUND3(BP, AX, BX, CX, DX, 51)
+	ROUND3(DX, BP, AX, BX, CX, 52)
+	ROUND3(CX, DX, BP, AX, BX, 53)
+	ROUND3(BX, CX, DX, BP, AX, 54)
+	ROUND3(AX, BX, CX, DX, BP, 55)
+	ROUND3(BP, AX, BX, CX, DX, 56)
+	ROUND3(DX, BP, AX, BX, CX, 57)
+	ROUND3(CX, DX, BP, AX, BX, 58)
+	ROUND3(BX, CX, DX, BP, AX, 59)
+	
+	ROUND4(AX, BX, CX, DX, BP, 60)
+	ROUND4(BP, AX, BX, CX, DX, 61)
+	ROUND4(DX, BP, AX, BX, CX, 62)
+	ROUND4(CX, DX, BP, AX, BX, 63)
+	ROUND4(BX, CX, DX, BP, AX, 64)
+	ROUND4(AX, BX, CX, DX, BP, 65)
+	ROUND4(BP, AX, BX, CX, DX, 66)
+	ROUND4(DX, BP, AX, BX, CX, 67)
+	ROUND4(CX, DX, BP, AX, BX, 68)
+	ROUND4(BX, CX, DX, BP, AX, 69)
+	ROUND4(AX, BX, CX, DX, BP, 70)
+	ROUND4(BP, AX, BX, CX, DX, 71)
+	ROUND4(DX, BP, AX, BX, CX, 72)
+	ROUND4(CX, DX, BP, AX, BX, 73)
+	ROUND4(BX, CX, DX, BP, AX, 74)
+	ROUND4(AX, BX, CX, DX, BP, 75)
+	ROUND4(BP, AX, BX, CX, DX, 76)
+	ROUND4(DX, BP, AX, BX, CX, 77)
+	ROUND4(CX, DX, BP, AX, BX, 78)
+	ROUND4(BX, CX, DX, BP, AX, 79)
+
+	ADDL	R11, AX
+	ADDL	R12, BX
+	ADDL	R13, CX
+	ADDL	R14, DX
+	ADDL	R15, BP
+
+	ADDQ	$64, SI
+	CMPQ	SI, DI
+	JB	loop
+
+end:
+	MOVQ	dig+0(FP), DI
+	MOVL	AX, (0*4)(DI)
+	MOVL	BX, (1*4)(DI)
+	MOVL	CX, (2*4)(DI)
+	MOVL	DX, (3*4)(DI)
+	MOVL	BP, (4*4)(DI)
+	RET
diff --git a/src/pkg/crypto/sha1/sha1block_decl.go b/src/pkg/crypto/sha1/sha1block_decl.go
new file mode 100644
index 000000000..3512a5829
--- /dev/null
+++ b/src/pkg/crypto/sha1/sha1block_decl.go
@@ -0,0 +1,9 @@
+// Copyright 2013 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 386
+
+package sha1
+
+func block(dig *digest, p []byte)
diff --git a/src/pkg/crypto/tls/common.go b/src/pkg/crypto/tls/common.go
index a888df762..f86c90de7 100644
--- a/src/pkg/crypto/tls/common.go
+++ b/src/pkg/crypto/tls/common.go
@@ -204,7 +204,24 @@ type Config struct {
 	// connections using that key are compromised.
 	SessionTicketKey [32]byte
 
-	serverInitOnce sync.Once
+	serverInitOnce sync.Once // guards calling (*Config).serverInit
+}
+
+func (c *Config) serverInit() {
+	if c.SessionTicketsDisabled {
+		return
+	}
+
+	// If the key has already been set then we have nothing to do.
+	for _, b := range c.SessionTicketKey {
+		if b != 0 {
+			return
+		}
+	}
+
+	if _, err := io.ReadFull(c.rand(), c.SessionTicketKey[:]); err != nil {
+		c.SessionTicketsDisabled = true
+	}
 }
 
 func (c *Config) rand() io.Reader {
diff --git a/src/pkg/crypto/tls/handshake_server.go b/src/pkg/crypto/tls/handshake_server.go
index 730991016..823730c60 100644
--- a/src/pkg/crypto/tls/handshake_server.go
+++ b/src/pkg/crypto/tls/handshake_server.go
@@ -33,22 +33,7 @@ func (c *Conn) serverHandshake() error {
 
 	// If this is the first server handshake, we generate a random key to
 	// encrypt the tickets with.
-	config.serverInitOnce.Do(func() {
-		if config.SessionTicketsDisabled {
-			return
-		}
-
-		// If the key has already been set then we have nothing to do.
-		for _, b := range config.SessionTicketKey {
-			if b != 0 {
-				return
-			}
-		}
-
-		if _, err := io.ReadFull(config.rand(), config.SessionTicketKey[:]); err != nil {
-			config.SessionTicketsDisabled = true
-		}
-	})
+	config.serverInitOnce.Do(config.serverInit)
 
 	hs := serverHandshakeState{
 		c: c,
diff --git a/src/pkg/crypto/x509/pkcs8.go b/src/pkg/crypto/x509/pkcs8.go
index 30caacb3c..8e1585e15 100644
--- a/src/pkg/crypto/x509/pkcs8.go
+++ b/src/pkg/crypto/x509/pkcs8.go
@@ -51,6 +51,4 @@ func ParsePKCS8PrivateKey(der []byte) (key interface{}, err error) {
 	default:
 		return nil, fmt.Errorf("crypto/x509: PKCS#8 wrapping contained private key with unknown algorithm: %v", privKey.Algo.Algorithm)
 	}
-
-	panic("unreachable")
 }
diff --git a/src/pkg/crypto/x509/x509.go b/src/pkg/crypto/x509/x509.go
index b802bf4eb..4dfea2c94 100644
--- a/src/pkg/crypto/x509/x509.go
+++ b/src/pkg/crypto/x509/x509.go
@@ -729,7 +729,6 @@ func parsePublicKey(algo PublicKeyAlgorithm, keyData *publicKeyInfo) (interface{
 	default:
 		return nil, nil
 	}
-	panic("unreachable")
 }
 
 func parseCertificate(in *certificate) (*Certificate, error) {