diff options
author | Michael Stapelberg <michael@stapelberg.de> | 2013-03-23 11:28:53 +0100 |
---|---|---|
committer | Michael Stapelberg <michael@stapelberg.de> | 2013-03-23 11:28:53 +0100 |
commit | b39e15dde5ec7b96c15da9faf4ab5892501c1aae (patch) | |
tree | 718cede1f6ca97d082c6c40b7dc3f4f6148253c0 /src/pkg/crypto/rc4 | |
parent | 04b08da9af0c450d645ab7389d1467308cfc2db8 (diff) | |
download | golang-upstream/1.1_hg20130323.tar.gz |
Imported Upstream version 1.1~hg20130323upstream/1.1_hg20130323
Diffstat (limited to 'src/pkg/crypto/rc4')
-rw-r--r-- | src/pkg/crypto/rc4/rc4.go | 6 | ||||
-rw-r--r-- | src/pkg/crypto/rc4/rc4_386.s | 51 | ||||
-rw-r--r-- | src/pkg/crypto/rc4/rc4_amd64.s | 202 | ||||
-rw-r--r-- | src/pkg/crypto/rc4/rc4_arm.s | 10 | ||||
-rw-r--r-- | src/pkg/crypto/rc4/rc4_asm.go | 4 | ||||
-rw-r--r-- | src/pkg/crypto/rc4/rc4_ref.go | 2 | ||||
-rw-r--r-- | src/pkg/crypto/rc4/rc4_test.go | 69 |
7 files changed, 282 insertions, 62 deletions
diff --git a/src/pkg/crypto/rc4/rc4.go b/src/pkg/crypto/rc4/rc4.go index e0c33fa4b..3d717c63b 100644 --- a/src/pkg/crypto/rc4/rc4.go +++ b/src/pkg/crypto/rc4/rc4.go @@ -13,7 +13,7 @@ import "strconv" // A Cipher is an instance of RC4 using a particular key. type Cipher struct { - s [256]byte + s [256]uint32 i, j uint8 } @@ -32,11 +32,11 @@ func NewCipher(key []byte) (*Cipher, error) { } var c Cipher for i := 0; i < 256; i++ { - c.s[i] = uint8(i) + c.s[i] = uint32(i) } var j uint8 = 0 for i := 0; i < 256; i++ { - j += c.s[i] + key[i%k] + j += uint8(c.s[i]) + key[i%k] c.s[i], c.s[j] = c.s[j], c.s[i] } return &c, nil diff --git a/src/pkg/crypto/rc4/rc4_386.s b/src/pkg/crypto/rc4/rc4_386.s new file mode 100644 index 000000000..c80ef2a3a --- /dev/null +++ b/src/pkg/crypto/rc4/rc4_386.s @@ -0,0 +1,51 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8) +TEXT ·xorKeyStream(SB),7,$0 + MOVL dst+0(FP), DI + MOVL src+4(FP), SI + MOVL state+12(FP), BP + + MOVL i+16(FP), AX + MOVBLZX (AX), AX + MOVL j+20(FP), BX + MOVBLZX (BX), BX + CMPL n+8(FP), $0 + JEQ done + +loop: + // i += 1 + INCB AX + + // j += c.s[i] + MOVBLZX (BP)(AX*4), DX + ADDB DX, BX + MOVBLZX BX, BX + + // c.s[i], c.s[j] = c.s[j], c.s[i] + MOVBLZX (BP)(BX*4), CX + MOVB CX, (BP)(AX*4) + MOVB DX, (BP)(BX*4) + + // *dst = *src ^ c.s[c.s[i]+c.s[j]] + ADDB DX, CX + MOVBLZX CX, CX + MOVB (BP)(CX*4), CX + XORB (SI), CX + MOVBLZX CX, CX + MOVB CX, (DI) + + INCL SI + INCL DI + DECL n+8(FP) + JNE loop + +done: + MOVL i+16(FP), CX + MOVB AX, (CX) + MOVL j+20(FP), CX + MOVB BX, (CX) + + RET diff --git a/src/pkg/crypto/rc4/rc4_amd64.s b/src/pkg/crypto/rc4/rc4_amd64.s index ffe9ada85..353fe3720 100644 --- a/src/pkg/crypto/rc4/rc4_amd64.s +++ b/src/pkg/crypto/rc4/rc4_amd64.s @@ -1,53 +1,177 @@ -// Copyright 2013 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Original source: +// http://www.zorinaq.com/papers/rc4-amd64.html +// http://www.zorinaq.com/papers/rc4-amd64.tar.bz2 + +// Local modifications: +// +// Transliterated from GNU to 6a assembly syntax by the Go authors. +// The comments and spacing are from the original. +// +// The new EXTEND macros avoid a bad stall on some systems after 8-bit math. +// +// The original code accumulated 64 bits of key stream in an integer +// register and then XOR'ed the key stream into the data 8 bytes at a time. +// Modified to accumulate 128 bits of key stream into an XMM register +// and then XOR the key stream into the data 16 bytes at a time. +// Approximately doubles throughput. + +// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5 +// but makes the code run 2.0x slower on Xeon. +#define EXTEND(r) MOVBLZX r, r + +/* +** RC4 implementation optimized for AMD64. +** +** Author: Marc Bevand <bevand_m (at) epita.fr> +** Licence: I hereby disclaim the copyright on this code and place it +** in the public domain. +** +** The code has been designed to be easily integrated into openssl: +** the exported RC4() function can replace the actual implementations +** openssl already contains. Please note that when linking with openssl, +** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled +** with -DRC4_INT='unsigned long'. +** +** The throughput achieved by this code is about 320 MBytes/sec, on +** a 1.8 GHz AMD Opteron (rev C0) processor. +*/ -// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8) TEXT ·xorKeyStream(SB),7,$0 - MOVQ dst+0(FP), DI - MOVQ src+8(FP), SI - MOVQ n+16(FP), CX - MOVQ state+24(FP), R8 + MOVQ n+16(FP), BX // rbx = ARG(len) + MOVQ src+8(FP), SI // in = ARG(in) + MOVQ dst+0(FP), DI // out = ARG(out) + MOVQ state+24(FP), BP // d = ARG(data) + MOVQ i+32(FP), AX + MOVBQZX 0(AX), CX // x = *xp + MOVQ j+40(FP), AX + MOVBQZX 0(AX), DX // y = *yp + + LEAQ (SI)(BX*1), R9 // limit = in+len + +l1: CMPQ SI, R9 // cmp in with in+len + JGE finished // jump if (in >= in+len) + + INCB CX + EXTEND(CX) + TESTL $15, CX + JZ wordloop + + MOVBLZX (BP)(CX*4), AX + + ADDB AX, DX // y += tx + EXTEND(DX) + MOVBLZX (BP)(DX*4), BX // ty = d[y] + MOVB BX, (BP)(CX*4) // d[x] = ty + ADDB AX, BX // val = ty+tx + EXTEND(BX) + MOVB AX, (BP)(DX*4) // d[y] = tx + MOVBLZX (BP)(BX*4), R8 // val = d[val] + XORB (SI), R8 // xor 1 byte + MOVB R8, (DI) + INCQ SI // in++ + INCQ DI // out++ + JMP l1 + +wordloop: + SUBQ $16, R9 + CMPQ SI, R9 + JGT end + +start: + ADDQ $16, SI // increment in + ADDQ $16, DI // increment out + + // Each KEYROUND generates one byte of key and + // inserts it into an XMM register at the given 16-bit index. + // The key state array is uint32 words only using the bottom + // byte of each word, so the 16-bit OR only copies 8 useful bits. + // We accumulate alternating bytes into X0 and X1, and then at + // the end we OR X1<<8 into X0 to produce the actual key. + // + // At the beginning of the loop, CX%16 == 0, so the 16 loads + // at state[CX], state[CX+1], ..., state[CX+15] can precompute + // (state+CX) as R12 and then become R12[0], R12[1], ... R12[15], + // without fear of the byte computation CX+15 wrapping around. + // + // The first round needs R12[0], the second needs R12[1], and so on. + // We can avoid memory stalls by starting the load for round n+1 + // before the end of round n, using the LOAD macro. + LEAQ (BP)(CX*4), R12 - MOVQ xPtr+32(FP), AX - MOVBQZX (AX), AX - MOVQ yPtr+40(FP), BX - MOVBQZX (BX), BX +#define KEYROUND(xmm, load, off, r1, r2, index) \ + MOVBLZX (BP)(DX*4), R8; \ + MOVB r1, (BP)(DX*4); \ + load((off+1), r2); \ + MOVB R8, (off*4)(R12); \ + ADDB r1, R8; \ + EXTEND(R8); \ + PINSRW $index, (BP)(R8*4), xmm -loop: - CMPQ CX, $0 - JE done +#define LOAD(off, reg) \ + MOVBLZX (off*4)(R12), reg; \ + ADDB reg, DX; \ + EXTEND(DX) - // c.i += 1 - INCB AX +#define SKIP(off, reg) - // c.j += c.s[c.i] - MOVB (R8)(AX*1), R9 - ADDB R9, BX + LOAD(0, AX) + KEYROUND(X0, LOAD, 0, AX, BX, 0) + KEYROUND(X1, LOAD, 1, BX, AX, 0) + KEYROUND(X0, LOAD, 2, AX, BX, 1) + KEYROUND(X1, LOAD, 3, BX, AX, 1) + KEYROUND(X0, LOAD, 4, AX, BX, 2) + KEYROUND(X1, LOAD, 5, BX, AX, 2) + KEYROUND(X0, LOAD, 6, AX, BX, 3) + KEYROUND(X1, LOAD, 7, BX, AX, 3) + KEYROUND(X0, LOAD, 8, AX, BX, 4) + KEYROUND(X1, LOAD, 9, BX, AX, 4) + KEYROUND(X0, LOAD, 10, AX, BX, 5) + KEYROUND(X1, LOAD, 11, BX, AX, 5) + KEYROUND(X0, LOAD, 12, AX, BX, 6) + KEYROUND(X1, LOAD, 13, BX, AX, 6) + KEYROUND(X0, LOAD, 14, AX, BX, 7) + KEYROUND(X1, SKIP, 15, BX, AX, 7) + + ADDB $16, CX - MOVBQZX (R8)(BX*1), R10 + PSLLQ $8, X1 + PXOR X1, X0 + MOVOU -16(SI), X2 + PXOR X0, X2 + MOVOU X2, -16(DI) - MOVB R10, (R8)(AX*1) - MOVB R9, (R8)(BX*1) + CMPQ SI, R9 // cmp in with in+len-16 + JLE start // jump if (in <= in+len-16) - // R11 = c.s[c.i]+c.s[c.j] - MOVQ R10, R11 - ADDB R9, R11 +end: + DECB CX + ADDQ $16, R9 // tmp = in+len - MOVB (R8)(R11*1), R11 - MOVB (SI), R12 - XORB R11, R12 - MOVB R12, (DI) + // handle the last bytes, one by one +l2: CMPQ SI, R9 // cmp in with in+len + JGE finished // jump if (in >= in+len) - INCQ SI - INCQ DI - DECQ CX + INCB CX + EXTEND(CX) + MOVBLZX (BP)(CX*4), AX - JMP loop -done: - MOVQ xPtr+32(FP), R8 - MOVB AX, (R8) - MOVQ yPtr+40(FP), R8 - MOVB BX, (R8) + ADDB AX, DX // y += tx + EXTEND(DX) + MOVBLZX (BP)(DX*4), BX // ty = d[y] + MOVB BX, (BP)(CX*4) // d[x] = ty + ADDB AX, BX // val = ty+tx + EXTEND(BX) + MOVB AX, (BP)(DX*4) // d[y] = tx + MOVBLZX (BP)(BX*4), R8 // val = d[val] + XORB (SI), R8 // xor 1 byte + MOVB R8, (DI) + INCQ SI // in++ + INCQ DI // out++ + JMP l2 +finished: + MOVQ j+40(FP), BX + MOVB DX, 0(BX) + MOVQ i+32(FP), AX + MOVB CX, 0(AX) RET diff --git a/src/pkg/crypto/rc4/rc4_arm.s b/src/pkg/crypto/rc4/rc4_arm.s index 51a332f62..307cb7148 100644 --- a/src/pkg/crypto/rc4/rc4_arm.s +++ b/src/pkg/crypto/rc4/rc4_arm.s @@ -31,19 +31,19 @@ loop: // i += 1; j += state[i] ADD $1, R(i) AND $0xff, R(i) - MOVBU R(i)<<0(R(state)), R(t) + MOVBU R(i)<<2(R(state)), R(t) ADD R(t), R(j) AND $0xff, R(j) // swap state[i] <-> state[j] - MOVBU R(j)<<0(R(state)), R(t2) - MOVB R(t2), R(i)<<0(R(state)) - MOVB R(t), R(j)<<0(R(state)) + MOVBU R(j)<<2(R(state)), R(t2) + MOVB R(t2), R(i)<<2(R(state)) + MOVB R(t), R(j)<<2(R(state)) // dst[k] = src[k] ^ state[state[i] + state[j]] ADD R(t2), R(t) AND $0xff, R(t) - MOVBU R(t)<<0(R(state)), R(t) + MOVBU R(t)<<2(R(state)), R(t) MOVBU R(k)<<0(R(src)), R(t2) EOR R(t), R(t2) MOVB R(t2), R(k)<<0(R(dst)) diff --git a/src/pkg/crypto/rc4/rc4_asm.go b/src/pkg/crypto/rc4/rc4_asm.go index 0b66e4a9e..c582a4488 100644 --- a/src/pkg/crypto/rc4/rc4_asm.go +++ b/src/pkg/crypto/rc4/rc4_asm.go @@ -2,11 +2,11 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build amd64 arm +// +build amd64 arm 386 package rc4 -func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8) +func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8) // XORKeyStream sets dst to the result of XORing src with the key stream. // Dst and src may be the same slice but otherwise should not overlap. diff --git a/src/pkg/crypto/rc4/rc4_ref.go b/src/pkg/crypto/rc4/rc4_ref.go index 1018548c2..44d380436 100644 --- a/src/pkg/crypto/rc4/rc4_ref.go +++ b/src/pkg/crypto/rc4/rc4_ref.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!arm +// +build !amd64,!arm,!386 package rc4 diff --git a/src/pkg/crypto/rc4/rc4_test.go b/src/pkg/crypto/rc4/rc4_test.go index 9e12789f7..7b4df6791 100644 --- a/src/pkg/crypto/rc4/rc4_test.go +++ b/src/pkg/crypto/rc4/rc4_test.go @@ -5,6 +5,8 @@ package rc4 import ( + "bytes" + "fmt" "testing" ) @@ -72,25 +74,68 @@ var golden = []rc4Test{ }, } +func testEncrypt(t *testing.T, desc string, c *Cipher, src, expect []byte) { + dst := make([]byte, len(src)) + c.XORKeyStream(dst, src) + for i, v := range dst { + if v != expect[i] { + t.Fatalf("%s: mismatch at byte %d:\nhave %x\nwant %x", desc, i, dst, expect) + } + } +} + func TestGolden(t *testing.T) { - for i := 0; i < len(golden); i++ { - g := golden[i] - c, err := NewCipher(g.key) - if err != nil { - t.Errorf("Failed to create cipher at golden index %d", i) - return + for gi, g := range golden { + data := make([]byte, len(g.keystream)) + for i := range data { + data[i] = byte(i) } - keystream := make([]byte, len(g.keystream)) - c.XORKeyStream(keystream, keystream) - for j, v := range keystream { - if g.keystream[j] != v { - t.Errorf("Failed at golden index %d:\n%x\nvs\n%x", i, keystream, g.keystream) - break + + expect := make([]byte, len(g.keystream)) + for i := range expect { + expect[i] = byte(i) ^ g.keystream[i] + } + + for size := 1; size <= len(g.keystream); size++ { + c, err := NewCipher(g.key) + if err != nil { + t.Fatalf("#%d: NewCipher: %v", gi, err) + } + + off := 0 + for off < len(g.keystream) { + n := len(g.keystream) - off + if n > size { + n = size + } + desc := fmt.Sprintf("#%d@[%d:%d]", gi, off, off+n) + testEncrypt(t, desc, c, data[off:off+n], expect[off:off+n]) + off += n } } } } +func TestBlock(t *testing.T) { + c1a, _ := NewCipher(golden[0].key) + c1b, _ := NewCipher(golden[1].key) + data1 := make([]byte, 1<<20) + for i := range data1 { + c1a.XORKeyStream(data1[i:i+1], data1[i:i+1]) + c1b.XORKeyStream(data1[i:i+1], data1[i:i+1]) + } + + c2a, _ := NewCipher(golden[0].key) + c2b, _ := NewCipher(golden[1].key) + data2 := make([]byte, 1<<20) + c2a.XORKeyStream(data2, data2) + c2b.XORKeyStream(data2, data2) + + if !bytes.Equal(data1, data2) { + t.Fatalf("bad block") + } +} + func benchmark(b *testing.B, size int64) { buf := make([]byte, size) c, err := NewCipher(golden[0].key) |