diff options
Diffstat (limited to 'src/pkg/crypto/rc4/rc4_amd64.s')
-rw-r--r-- | src/pkg/crypto/rc4/rc4_amd64.s | 179 |
1 files changed, 0 insertions, 179 deletions
// Provenance: recovered from a deletion diff of
//	a/src/pkg/crypto/rc4/rc4_amd64.s -> /dev/null
//	(deleted file mode 100644, index e3234b6c7..000000000, @@ -1,179 +0,0 @@)
// The per-line "-" diff markers have been stripped and line breaks restored.

// Original source:
//	http://www.zorinaq.com/papers/rc4-amd64.html
//	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2

#include "../../../cmd/ld/textflag.h"

// Local modifications:
//
// Transliterated from GNU to 6a assembly syntax by the Go authors.
// The comments and spacing are from the original.
//
// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
//
// The original code accumulated 64 bits of key stream in an integer
// register and then XOR'ed the key stream into the data 8 bytes at a time.
// Modified to accumulate 128 bits of key stream into an XMM register
// and then XOR the key stream into the data 16 bytes at a time.
// Approximately doubles throughput.

// EXTEND(r) zero-extends the low byte of r into the rest of the 32-bit
// register. It is applied after every 8-bit INCB/ADDB below so that the
// register can be used directly as a 0..255 array index.
//
// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
// but makes the code run 2.0x slower on Xeon.
#define EXTEND(r) MOVBLZX r, r

/*
** RC4 implementation optimized for AMD64.
**
** Author: Marc Bevand <bevand_m (at) epita.fr>
** Licence: I hereby disclaim the copyright on this code and place it
** in the public domain.
**
** The code has been designed to be easily integrated into openssl:
** the exported RC4() function can replace the actual implementations
** openssl already contains.  Please note that when linking with openssl,
** it requires that sizeof(RC4_INT) == 8.  So openssl must be compiled
** with -DRC4_INT='unsigned long'.
**
** The throughput achieved by this code is about 320 MBytes/sec, on
** a 1.8 GHz AMD Opteron (rev C0) processor.
*/

// xorKeyStream XORs n bytes of RC4 key stream into src, writing the result
// to dst, and updates the cipher state in place.
//
// Arguments (read from the caller's frame at the offsets used below; the Go
// declaration is not visible in this file):
//	dst+0(FP)	pointer to the output bytes
//	src+8(FP)	pointer to the input bytes
//	n+16(FP)	number of bytes to process
//	state+24(FP)	pointer to the permutation table; entries are addressed
//			with a *4 scale and accessed one byte at a time
//	i+32(FP)	pointer to the byte holding index x (read on entry,
//			written back on exit)
//	j+40(FP)	pointer to the byte holding index y (read on entry,
//			written back on exit)
//
// Register roles:
//	SI = in (advancing source pointer)	DI = out (advancing dest pointer)
//	BP = d (state base)			R9 = limit (in+len, or in+len-16
//						     while in the 16-byte loop)
//	CX = x					DX = y
//	AX, BX = tx/ty scratch			R8 = key byte / swap scratch
//	R12 = &d[x] inside the 16-byte loop	X0, X1 = key stream, X2 = data
//
// Structure:
//	l1        one byte per iteration until the incremented x is a multiple
//	          of 16 (or the input is exhausted)
//	wordloop/ 16 bytes per iteration while at least 16 bytes remain
//	start
//	end/l2    one byte per iteration for whatever is left
//	finished  store y and x back through the j and i pointers
//
// NOTE(review): the pointer comparisons below use the signed condition codes
// (JGE/JGT/JLE). Signed and unsigned ordering agree as long as the buffer
// does not straddle the signed boundary, so this is presumably harmless;
// confirm before reusing the pattern elsewhere.
TEXT ·xorKeyStream(SB),NOSPLIT,$0
	MOVQ	n+16(FP),	BX		// rbx = ARG(len)
	MOVQ	src+8(FP),	SI		// in = ARG(in)
	MOVQ	dst+0(FP),	DI		// out = ARG(out)
	MOVQ	state+24(FP),	BP		// d = ARG(data)
	MOVQ	i+32(FP),	AX
	MOVBQZX	0(AX),		CX		// x = *xp
	MOVQ	j+40(FP),	AX
	MOVBQZX	0(AX),		DX		// y = *yp

	LEAQ	(SI)(BX*1),	R9		// limit = in+len

	// Byte-at-a-time prologue: runs until x reaches a multiple of 16.
l1:	CMPQ	SI,		R9		// cmp in with in+len
	JGE	finished			// jump if (in >= in+len)

	INCB	CX				// x = (x+1) & 0xff
	EXTEND(CX)
	TESTL	$15,		CX		// x%16 == 0 ?
	JZ	wordloop			// yes: x is already incremented, go try 16 bytes at a time

	MOVBLZX	(BP)(CX*4),	AX		// tx = d[x]

	ADDB	AX,		DX		// y += tx
	EXTEND(DX)
	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
	ADDB	AX,		BX		// val = ty+tx
	EXTEND(BX)
	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
	XORB	(SI),		R8		// xor 1 byte
	MOVB	R8,		(DI)
	INCQ	SI				// in++
	INCQ	DI				// out++
	JMP l1

	// On entry here CX holds the next index to use (already incremented)
	// and CX%16 == 0.
wordloop:
	SUBQ	$16,		R9		// limit = in+len-16
	CMPQ	SI,		R9
	JGT	end				// fewer than 16 bytes left: skip the wide loop

start:
	// The pointers are advanced first; the data for this iteration is
	// then addressed as -16(SI) / -16(DI) below.
	ADDQ	$16,		SI		// increment in
	ADDQ	$16,		DI		// increment out

	// Each KEYROUND generates one byte of key and
	// inserts it into an XMM register at the given 16-bit index.
	// The key state array is uint32 words only using the bottom
	// byte of each word, so the 16-bit OR only copies 8 useful bits.
	// We accumulate alternating bytes into X0 and X1, and then at
	// the end we OR X1<<8 into X0 to produce the actual key.
	//
	// At the beginning of the loop, CX%16 == 0, so the 16 loads
	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
	// without fear of the byte computation CX+15 wrapping around.
	//
	// The first round needs R12[0], the second needs R12[1], and so on.
	// We can avoid memory stalls by starting the load for round n+1
	// before the end of round n, using the LOAD macro.
	LEAQ	(BP)(CX*4),	R12

// KEYROUND(xmm, load, off, r1, r2, index) performs one RC4 step for x = CX+off.
// On entry r1 holds tx = d[x] and DX already holds the updated y.
// It reads ty = d[y] into R8, stores tx to d[y], invokes load for the next
// round (which fetches d[x+1] into r2 and advances y), stores ty to d[x],
// then inserts the 16-bit word at d[(tx+ty)&0xff] into word slot `index` of xmm.
// The statement order is deliberate; do not reorder.
#define KEYROUND(xmm, load, off, r1, r2, index) \
	MOVBLZX	(BP)(DX*4),	R8; \
	MOVB	r1,		(BP)(DX*4); \
	load((off+1), r2); \
	MOVB	R8,		(off*4)(R12); \
	ADDB	r1,		R8; \
	EXTEND(R8); \
	PINSRW	$index, (BP)(R8*4), xmm

// LOAD(off, reg) fetches tx = R12[off] into reg and advances y by it.
#define LOAD(off, reg) \
	MOVBLZX	(off*4)(R12),	reg; \
	ADDB	reg,		DX; \
	EXTEND(DX)

// SKIP(off, reg) expands to nothing; it is passed as `load` for the last
// round, which has no following round to prefetch for.
#define SKIP(off, reg)

	// Rounds alternate AX/BX as (current tx, next tx) and alternate
	// X0/X1 as the destination, so X0 collects rounds 0,2,...,14 and
	// X1 collects rounds 1,3,...,15, each in word slots 0..7.
	LOAD(0, AX)
	KEYROUND(X0, LOAD, 0, AX, BX, 0)
	KEYROUND(X1, LOAD, 1, BX, AX, 0)
	KEYROUND(X0, LOAD, 2, AX, BX, 1)
	KEYROUND(X1, LOAD, 3, BX, AX, 1)
	KEYROUND(X0, LOAD, 4, AX, BX, 2)
	KEYROUND(X1, LOAD, 5, BX, AX, 2)
	KEYROUND(X0, LOAD, 6, AX, BX, 3)
	KEYROUND(X1, LOAD, 7, BX, AX, 3)
	KEYROUND(X0, LOAD, 8, AX, BX, 4)
	KEYROUND(X1, LOAD, 9, BX, AX, 4)
	KEYROUND(X0, LOAD, 10, AX, BX, 5)
	KEYROUND(X1, LOAD, 11, BX, AX, 5)
	KEYROUND(X0, LOAD, 12, AX, BX, 6)
	KEYROUND(X1, LOAD, 13, BX, AX, 6)
	KEYROUND(X0, LOAD, 14, AX, BX, 7)
	KEYROUND(X1, SKIP, 15, BX, AX, 7)

	ADDB	$16,		CX		// x += 16 (byte add; wraps at 256)

	// Merge the two halves: shift the odd-round bytes up by one byte and
	// combine with the even-round bytes. Given the note above that only
	// the bottom byte of each state word is used, the two operands
	// presumably occupy disjoint bytes, so PXOR acts as the OR described.
	PSLLQ	$8,		X1
	PXOR	X1,		X0
	MOVOU	-16(SI),	X2		// load 16 input bytes (unaligned)
	PXOR	X0,		X2		// xor 16 bytes
	MOVOU	X2,		-16(DI)		// store 16 output bytes (unaligned)

	CMPQ	SI,		R9		// cmp in with in+len-16
	JLE	start				// jump if (in <= in+len-16)

end:
	// CX currently holds the next index to use; step it back by one so
	// that the INCB at the top of l2 lands on that index again (and so
	// that `finished` stores the last index actually consumed).
	DECB	CX
	ADDQ	$16,		R9		// tmp = in+len

	// handle the last bytes, one by one
l2:	CMPQ	SI,		R9		// cmp in with in+len
	JGE	finished			// jump if (in >= in+len)

	INCB	CX				// x = (x+1) & 0xff
	EXTEND(CX)
	MOVBLZX	(BP)(CX*4),	AX		// tx = d[x]

	ADDB	AX,		DX		// y += tx
	EXTEND(DX)
	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
	ADDB	AX,		BX		// val = ty+tx
	EXTEND(BX)
	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
	XORB	(SI),		R8		// xor 1 byte
	MOVB	R8,		(DI)
	INCQ	SI				// in++
	INCQ	DI				// out++
	JMP l2

finished:
	// Write the updated indices back through the caller's pointers.
	MOVQ	j+40(FP),	BX
	MOVB	DX, 0(BX)			// *yp = y
	MOVQ	i+32(FP),	AX
	MOVB	CX, 0(AX)			// *xp = x
	RET