Diffstat (limited to 'src/pkg/crypto/rc4')
-rw-r--r--   src/pkg/crypto/rc4/rc4.go          |  17
-rw-r--r--   src/pkg/crypto/rc4/rc4_amd64p32.s  | 192
-rw-r--r--   src/pkg/crypto/rc4/rc4_asm.go      |   2
-rw-r--r--   src/pkg/crypto/rc4/rc4_ref.go      |  11
-rw-r--r--   src/pkg/crypto/rc4/rc4_test.go     |  19
5 files changed, 227 insertions, 14 deletions
diff --git a/src/pkg/crypto/rc4/rc4.go b/src/pkg/crypto/rc4/rc4.go
index 3d717c63b..9acb681bf 100644
--- a/src/pkg/crypto/rc4/rc4.go
+++ b/src/pkg/crypto/rc4/rc4.go
@@ -50,3 +50,20 @@ func (c *Cipher) Reset() {
 	}
 	c.i, c.j = 0, 0
 }
+
+// xorKeyStreamGeneric sets dst to the result of XORing src with the
+// key stream. Dst and src may be the same slice but otherwise should
+// not overlap.
+//
+// This is the pure Go version. rc4_{amd64,386,arm}* contain assembly
+// implementations. This is here for tests and to prevent bitrot.
+func (c *Cipher) xorKeyStreamGeneric(dst, src []byte) {
+	i, j := c.i, c.j
+	for k, v := range src {
+		i += 1
+		j += uint8(c.s[i])
+		c.s[i], c.s[j] = c.s[j], c.s[i]
+		dst[k] = v ^ uint8(c.s[uint8(c.s[i]+c.s[j])])
+	}
+	c.i, c.j = i, j
+}
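For context, the exported API that both the generic and assembly paths implement is rc4.NewCipher plus (*Cipher).XORKeyStream. A minimal usage sketch (illustrative only, not part of this change; the key and message bytes are made up):

package main

import (
	"crypto/rc4"
	"fmt"
)

func main() {
	// NewCipher accepts a key of 1 to 256 bytes.
	c, err := rc4.NewCipher([]byte("example key"))
	if err != nil {
		panic(err)
	}
	data := []byte("attack at dawn")
	// XORKeyStream may be called with dst == src to work in place,
	// which is exactly what the tests later in this diff do.
	c.XORKeyStream(data, data)
	fmt.Printf("%x\n", data)
}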
diff --git a/src/pkg/crypto/rc4/rc4_amd64p32.s b/src/pkg/crypto/rc4/rc4_amd64p32.s
new file mode 100644
index 000000000..27d849507
--- /dev/null
+++ b/src/pkg/crypto/rc4/rc4_amd64p32.s
@@ -0,0 +1,192 @@
+// Original source:
+//	http://www.zorinaq.com/papers/rc4-amd64.html
+//	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
+
+#include "../../../cmd/ld/textflag.h"
+
+// Local modifications:
+//
+// Transliterated from GNU to 6a assembly syntax by the Go authors.
+// The comments and spacing are from the original.
+//
+// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
+//
+// The original code accumulated 64 bits of key stream in an integer
+// register and then XOR'ed the key stream into the data 8 bytes at a time.
+// Modified to accumulate 128 bits of key stream into an XMM register
+// and then XOR the key stream into the data 16 bytes at a time.
+// Approximately doubles throughput.
+//
+// Converted to amd64p32.
+//
+// To make safe for Native Client, avoid use of BP, R15,
+// and two-register addressing modes.
+
+// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
+// but makes the code run 2.0x slower on Xeon.
+#define EXTEND(r) MOVBLZX r, r
+
+/*
+** RC4 implementation optimized for AMD64.
+**
+** Author: Marc Bevand <bevand_m (at) epita.fr>
+** Licence: I hereby disclaim the copyright on this code and place it
+** in the public domain.
+**
+** The code has been designed to be easily integrated into openssl:
+** the exported RC4() function can replace the actual implementations
+** openssl already contains. Please note that when linking with openssl,
+** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
+** with -DRC4_INT='unsigned long'.
+**
+** The throughput achieved by this code is about 320 MBytes/sec, on
+** a 1.8 GHz AMD Opteron (rev C0) processor.
+*/
+
+TEXT ·xorKeyStream(SB),NOSPLIT,$0
+	MOVL	n+8(FP), BX		// rbx = ARG(len)
+	MOVL	src+4(FP), SI		// in = ARG(in)
+	MOVL	dst+0(FP), DI		// out = ARG(out)
+	MOVL	state+12(FP), R10	// d = ARG(data)
+	MOVL	i+16(FP), AX
+	MOVBQZX	0(AX), CX		// x = *xp
+	MOVL	j+20(FP), AX
+	MOVBQZX	0(AX), DX		// y = *yp
+
+	LEAQ	(SI)(BX*1), R9		// limit = in+len
+
+l1:	CMPQ	SI, R9			// cmp in with in+len
+	JGE	finished		// jump if (in >= in+len)
+
+	INCB	CX
+	EXTEND(CX)
+	TESTL	$15, CX
+	JZ	wordloop
+	LEAL	(R10)(CX*4), R12
+
+	MOVBLZX	(R12), AX
+
+	ADDB	AX, DX			// y += tx
+	EXTEND(DX)
+	LEAL	(R10)(DX*4), R11
+	MOVBLZX	(R11), BX		// ty = d[y]
+	MOVB	BX, (R12)		// d[x] = ty
+	ADDB	AX, BX			// val = ty+tx
+	EXTEND(BX)
+	LEAL	(R10)(BX*4), R13
+	MOVB	AX, (R11)		// d[y] = tx
+	MOVBLZX	(R13), R8		// val = d[val]
+	XORB	(SI), R8		// xor 1 byte
+	MOVB	R8, (DI)
+	INCQ	SI			// in++
+	INCQ	DI			// out++
+	JMP	l1
+
+wordloop:
+	SUBQ	$16, R9
+	CMPQ	SI, R9
+	JGT	end
+
+start:
+	ADDQ	$16, SI			// increment in
+	ADDQ	$16, DI			// increment out
+
+	// Each KEYROUND generates one byte of key and
+	// inserts it into an XMM register at the given 16-bit index.
+	// The key state array is uint32 words only using the bottom
+	// byte of each word, so the 16-bit OR only copies 8 useful bits.
+	// We accumulate alternating bytes into X0 and X1, and then at
+	// the end we OR X1<<8 into X0 to produce the actual key.
+	//
+	// At the beginning of the loop, CX%16 == 0, so the 16 loads
+	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
+	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
+	// without fear of the byte computation CX+15 wrapping around.
+	//
+	// The first round needs R12[0], the second needs R12[1], and so on.
+	// We can avoid memory stalls by starting the load for round n+1
+	// before the end of round n, using the LOAD macro.
+	LEAQ	(R10)(CX*4), R12
+
+#define KEYROUND(xmm, load, off, r1, r2, index) \
+	LEAL	(R10)(DX*4), R11; \
+	MOVBLZX	(R11), R8; \
+	MOVB	r1, (R11); \
+	load((off+1), r2); \
+	MOVB	R8, (off*4)(R12); \
+	ADDB	r1, R8; \
+	EXTEND(R8); \
+	LEAL	(R10)(R8*4), R14; \
+	PINSRW	$index, (R14), xmm
+
+#define LOAD(off, reg) \
+	MOVBLZX	(off*4)(R12), reg; \
+	ADDB	reg, DX; \
+	EXTEND(DX)
+
+#define SKIP(off, reg)
+
+	LOAD(0, AX)
+	KEYROUND(X0, LOAD, 0, AX, BX, 0)
+	KEYROUND(X1, LOAD, 1, BX, AX, 0)
+	KEYROUND(X0, LOAD, 2, AX, BX, 1)
+	KEYROUND(X1, LOAD, 3, BX, AX, 1)
+	KEYROUND(X0, LOAD, 4, AX, BX, 2)
+	KEYROUND(X1, LOAD, 5, BX, AX, 2)
+	KEYROUND(X0, LOAD, 6, AX, BX, 3)
+	KEYROUND(X1, LOAD, 7, BX, AX, 3)
+	KEYROUND(X0, LOAD, 8, AX, BX, 4)
+	KEYROUND(X1, LOAD, 9, BX, AX, 4)
+	KEYROUND(X0, LOAD, 10, AX, BX, 5)
+	KEYROUND(X1, LOAD, 11, BX, AX, 5)
+	KEYROUND(X0, LOAD, 12, AX, BX, 6)
+	KEYROUND(X1, LOAD, 13, BX, AX, 6)
+	KEYROUND(X0, LOAD, 14, AX, BX, 7)
+	KEYROUND(X1, SKIP, 15, BX, AX, 7)
+
+	ADDB	$16, CX
+
+	PSLLQ	$8, X1
+	PXOR	X1, X0
+	MOVOU	-16(SI), X2
+	PXOR	X0, X2
+	MOVOU	X2, -16(DI)
+
+	CMPQ	SI, R9			// cmp in with in+len-16
+	JLE	start			// jump if (in <= in+len-16)
+
+end:
+	DECB	CX
+	ADDQ	$16, R9			// tmp = in+len
+
+	// handle the last bytes, one by one
+l2:	CMPQ	SI, R9			// cmp in with in+len
+	JGE	finished		// jump if (in >= in+len)
+
+	INCB	CX
+	EXTEND(CX)
+	LEAL	(R10)(CX*4), R12
+	MOVBLZX	(R12), AX
+
+	ADDB	AX, DX			// y += tx
+	EXTEND(DX)
+	LEAL	(R10)(DX*4), R11
+	MOVBLZX	(R11), BX		// ty = d[y]
+	MOVB	BX, (R12)		// d[x] = ty
+	ADDB	AX, BX			// val = ty+tx
+	EXTEND(BX)
+	LEAL	(R10)(BX*4), R13
+	MOVB	AX, (R11)		// d[y] = tx
+	MOVBLZX	(R13), R8		// val = d[val]
+	XORB	(SI), R8		// xor 1 byte
+	MOVB	R8, (DI)
+	INCQ	SI			// in++
+	INCQ	DI			// out++
+	JMP	l2
+
+finished:
+	MOVL	j+20(FP), BX
+	MOVB	DX, 0(BX)
+	MOVL	i+16(FP), AX
+	MOVB	CX, 0(AX)
+	RET
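To make the 16-byte batching in the assembly above easier to follow, here is the same idea as a pure Go sketch. This is an illustration under assumptions, not code from this change: xorKeyStreamBatched and its identity key schedule are invented for the example.

package main

import "fmt"

// xorKeyStreamBatched mirrors the shape of the XMM code: generate 16
// key-stream bytes at a time (the assembly accumulates them in X0/X1
// via PINSRW), XOR the whole batch into the data in one step
// (PXOR/MOVOU), and fall back to single bytes for the tail, like the
// byte-at-a-time loops l1/l2.
func xorKeyStreamBatched(s *[256]uint32, i, j *uint8, dst, src []byte) {
	x, y := *i, *j
	var key [16]byte
	for len(src) >= 16 {
		for b := range key {
			x++
			y += uint8(s[x])
			s[x], s[y] = s[y], s[x]
			key[b] = uint8(s[uint8(s[x]+s[y])])
		}
		for b, v := range key {
			dst[b] = src[b] ^ v
		}
		dst, src = dst[16:], src[16:]
	}
	for k, v := range src { // tail: one byte at a time
		x++
		y += uint8(s[x])
		s[x], s[y] = s[y], s[x]
		dst[k] = v ^ uint8(s[uint8(s[x]+s[y])])
	}
	*i, *j = x, y
}

func main() {
	// An identity permutation stands in for a real RC4 key schedule so
	// the sketch runs standalone; the output is not a valid RC4 stream.
	var s [256]uint32
	for k := range s {
		s[k] = uint32(k)
	}
	var i, j uint8
	data := make([]byte, 20)
	xorKeyStreamBatched(&s, &i, &j, data, data)
	fmt.Printf("%x\n", data)
}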
diff --git a/src/pkg/crypto/rc4/rc4_asm.go b/src/pkg/crypto/rc4/rc4_asm.go
index c582a4488..fc71b9a6f 100644
--- a/src/pkg/crypto/rc4/rc4_asm.go
+++ b/src/pkg/crypto/rc4/rc4_asm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build amd64 arm 386
+// +build amd64 amd64p32 arm 386
 
 package rc4
 
diff --git a/src/pkg/crypto/rc4/rc4_ref.go b/src/pkg/crypto/rc4/rc4_ref.go
index 44d380436..1ecce1a7f 100644
--- a/src/pkg/crypto/rc4/rc4_ref.go
+++ b/src/pkg/crypto/rc4/rc4_ref.go
@@ -2,19 +2,12 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64,!arm,!386
+// +build !amd64,!amd64p32,!arm,!386
 
 package rc4
 
 // XORKeyStream sets dst to the result of XORing src with the key stream.
 // Dst and src may be the same slice but otherwise should not overlap.
 func (c *Cipher) XORKeyStream(dst, src []byte) {
-	i, j := c.i, c.j
-	for k, v := range src {
-		i += 1
-		j += c.s[i]
-		c.s[i], c.s[j] = c.s[j], c.s[i]
-		dst[k] = v ^ c.s[c.s[i]+c.s[j]]
-	}
-	c.i, c.j = i, j
+	c.xorKeyStreamGeneric(dst, src)
 }
diff --git a/src/pkg/crypto/rc4/rc4_test.go b/src/pkg/crypto/rc4/rc4_test.go
index 7b4df6791..af7988246 100644
--- a/src/pkg/crypto/rc4/rc4_test.go
+++ b/src/pkg/crypto/rc4/rc4_test.go
@@ -117,19 +117,30 @@ func TestGolden(t *testing.T) {
 }
 
 func TestBlock(t *testing.T) {
+	testBlock(t, (*Cipher).XORKeyStream)
+}
+
+// Test the pure Go version.
+// Because we have assembly for amd64, 386, and arm, this prevents
+// bitrot of the reference implementations.
+func TestBlockGeneric(t *testing.T) {
+	testBlock(t, (*Cipher).xorKeyStreamGeneric)
+}
+
+func testBlock(t *testing.T, xor func(c *Cipher, dst, src []byte)) {
 	c1a, _ := NewCipher(golden[0].key)
 	c1b, _ := NewCipher(golden[1].key)
 	data1 := make([]byte, 1<<20)
 	for i := range data1 {
-		c1a.XORKeyStream(data1[i:i+1], data1[i:i+1])
-		c1b.XORKeyStream(data1[i:i+1], data1[i:i+1])
+		xor(c1a, data1[i:i+1], data1[i:i+1])
+		xor(c1b, data1[i:i+1], data1[i:i+1])
 	}
 
 	c2a, _ := NewCipher(golden[0].key)
 	c2b, _ := NewCipher(golden[1].key)
 	data2 := make([]byte, 1<<20)
-	c2a.XORKeyStream(data2, data2)
-	c2b.XORKeyStream(data2, data2)
+	xor(c2a, data2, data2)
+	xor(c2b, data2, data2)
 
 	if !bytes.Equal(data1, data2) {
 		t.Fatalf("bad block")
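A side note on the test refactor: testBlock can receive the cipher routine as a plain function because (*Cipher).XORKeyStream is a Go method expression, which turns a method into an ordinary function taking the receiver as its first argument. That is what lets one harness drive both the exported assembly-backed method and the unexported generic one. A tiny self-contained illustration (Counter and Add are invented for the example):

package main

import "fmt"

type Counter struct{ n int }

func (c *Counter) Add(d int) { c.n += d }

func main() {
	// (*Counter).Add has type func(*Counter, int), just as
	// (*Cipher).XORKeyStream has type func(*Cipher, []byte, []byte).
	add := (*Counter).Add
	c := &Counter{}
	add(c, 3)
	fmt.Println(c.n) // prints 3
}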