diff options
Diffstat (limited to 'src/pkg/runtime/asm_amd64.s')
-rw-r--r-- | src/pkg/runtime/asm_amd64.s | 302 |
1 files changed, 299 insertions, 3 deletions
diff --git a/src/pkg/runtime/asm_amd64.s b/src/pkg/runtime/asm_amd64.s
index 987958498..0dee1556d 100644
--- a/src/pkg/runtime/asm_amd64.s
+++ b/src/pkg/runtime/asm_amd64.s
@@ -6,8 +6,8 @@
 
 TEXT _rt0_amd64(SB),7,$-8
 	// copy arguments forward on an even stack
-	MOVQ	0(DI), AX		// argc
-	LEAQ	8(DI), BX		// argv
+	MOVQ	DI, AX		// argc: now taken from DI itself instead of being loaded from 0(DI)
+	MOVQ	SI, BX		// argv: now taken from SI instead of being computed as 8(DI)
 	SUBQ	$(4*8+7), SP		// 2args 2auto
 	ANDQ	$~15, SP
 	MOVQ	AX, 16(SP)
@@ -20,12 +20,24 @@ TEXT _rt0_amd64(SB),7,$-8
 	MOVQ	BX, g_stackguard(DI)
 	MOVQ	SP, g_stackbase(DI)
 
+	// find out information about the processor we're on
+	MOVQ	$0, AX	// select CPUID leaf 0
+	CPUID
+	CMPQ	AX, $0	// leaf 0 reports the highest supported leaf in AX
+	JE	nocpuinfo	// leaf 1 not available: cpuid_ecx/cpuid_edx are left unwritten
+	MOVQ	$1, AX	// select CPUID leaf 1 (feature flags come back in CX/DX)
+	CPUID
+	MOVL	CX, runtime·cpuid_ecx(SB)	// save the leaf-1 ECX feature word
+	MOVL	DX, runtime·cpuid_edx(SB)	// save the leaf-1 EDX feature word
+nocpuinfo:
+
 	// if there is an _cgo_init, call it.
 	MOVQ	_cgo_init(SB), AX
 	TESTQ	AX, AX
 	JZ	needtls
 	// g0 already in DI
 	MOVQ	DI, CX	// Win64 uses CX for first parameter
+	MOVQ	$setmg_gcc<>(SB), SI	// address of setmg_gcc<> in SI for the call below; NOTE(review): no Win64 argument register is loaded with it - confirm that is intended
 	CALL	AX
 	CMPL	runtime·iswindows(SB), $0
 	JEQ	ok
@@ -65,6 +77,7 @@ ok:
 	MOVQ	AX, 8(SP)
 	CALL	runtime·args(SB)
 	CALL	runtime·osinit(SB)
+	CALL	runtime·hashinit(SB)	// new start-up step, ordered after osinit and before schedinit
 	CALL	runtime·schedinit(SB)
 
 	// create a new goroutine to start program
@@ -442,6 +455,12 @@ TEXT runtime·xchg(SB), 7, $0
 	XCHGL	AX, 0(BX)
 	RET
 
+TEXT runtime·xchg64(SB), 7, $0	// quadword variant of the XCHGL sequence just above
+	MOVQ	8(SP), BX	// BX = address operand, read from 8(SP)
+	MOVQ	16(SP), AX	// AX = new value, read from 16(SP)
+	XCHGQ	AX, 0(BX)	// swap AX with the quadword at 0(BX); AX leaves holding the previous contents
+	RET
+
 TEXT runtime·procyield(SB),7,$0
 	MOVL	8(SP), AX
 again:
@@ -664,6 +683,13 @@ settls:
 	MOVQ	BX, g(CX)
 	RET
 
+// void setmg_gcc(M*, G*); set m and g called from gcc.
+TEXT setmg_gcc<>(SB),7,$0
+	get_tls(AX)	// AX = TLS base via the get_tls macro (defined outside this diff)
+	MOVQ	DI, m(AX)	// first argument register (DI) becomes m
+	MOVQ	SI, g(AX)	// second argument register (SI) becomes g
+	RET
+
 // check that SP is in range [g->stackbase, g->stackguard)
 TEXT runtime·stackcheck(SB), 7, $0
 	get_tls(CX)
@@ -719,7 +745,277 @@ TEXT runtime·stackguard(SB),7,$0
 	get_tls(CX)
 	MOVQ	g(CX), BX
 	MOVQ	g_stackguard(BX), DX
-	MOVQ	DX, guard+8(FP)
+	MOVQ	DX, limit+8(FP)	// same 8(FP) slot as before; only the symbolic name changes from guard to limit
 	RET
 
 GLOBL runtime·tls0(SB), $64
+
+// hash function using AES hardware instructions
+TEXT runtime·aeshash(SB),7,$0	// loads its three stack arguments into DX/CX/AX and jumps to aeshashbody
+	MOVQ	8(SP), DX	// ptr to hash value
+	MOVQ	16(SP), CX	// size
+	MOVQ	24(SP), AX	// ptr to data
+	JMP	runtime·aeshashbody(SB)
+
+TEXT runtime·aeshashstr(SB),7,$0	// string variant: length and data pointer are read from the string struct; 16(SP) is not read
+	MOVQ	8(SP), DX	// ptr to hash value
+	MOVQ	24(SP), AX	// ptr to string struct
+	MOVQ	8(AX), CX	// length of string
+	MOVQ	(AX), AX	// string data
+	JMP	runtime·aeshashbody(SB)
+
+// AX: data
+// CX: length
+// DX: ptr to seed input / hash output
+TEXT runtime·aeshashbody(SB),7,$0
+	MOVQ	(DX), X0	// seed to low 64 bits of xmm0
+	PINSRQ	$1, CX, X0	// size to high 64 bits of xmm0
+	MOVO	runtime·aeskeysched+0(SB), X2	// X2 = first 16 bytes of aeskeysched (defined outside this diff)
+	MOVO	runtime·aeskeysched+16(SB), X3	// X3 = next 16 bytes of aeskeysched
+aesloop:	// consume the input one full 16-byte block at a time
+	CMPQ	CX, $16
+	JB	aesloopend	// fewer than 16 bytes left (unsigned compare)
+	MOVOU	(AX), X1	// unaligned load of the next 16 data bytes
+	AESENC	X2, X0	// one AES round on the state, keyed with X2
+	AESENC	X1, X0	// one AES round on the state, keyed with the data block
+	SUBQ	$16, CX
+	ADDQ	$16, AX
+	JMP	aesloop
+aesloopend:	// here CX = number of leftover bytes, 0..15
+	TESTQ	CX, CX
+	JE	finalize	// no partial block
+
+	TESTQ	$16, AX	// test bit 4 of the data address
+	JNE	highpartial
+
+	// address ends in 0xxxx. 16 bytes loaded
+	// at this address won't cross a page boundary, so
+	// we can load it directly.
+	MOVOU	(AX), X1
+	ADDQ	CX, CX	// CX = 2*len, so (CX*8) steps through the table in 16-byte entries
+	PAND	masks(SB)(CX*8), X1	// keep only the low len bytes; NOTE(review): PAND with a memory operand needs a 16-byte aligned address - confirm masks is aligned
+	JMP	partial
+highpartial:
+	// address ends in 1xxxx. Might be up against
+	// a page boundary, so load ending at last byte.
+	// Then shift bytes down using pshufb.
+	MOVOU	-16(AX)(CX*1), X1	// 16 bytes ending exactly at the last data byte
+	ADDQ	CX, CX	// CX = 2*len, so (CX*8) steps through the table in 16-byte entries
+	PSHUFB	shifts(SB)(CX*8), X1	// move the top len bytes to the bottom and zero the rest; NOTE(review): same 16-byte alignment requirement for shifts
+partial:
+	// incorporate partial block into hash
+	AESENC	X3, X0
+	AESENC	X1, X0
+finalize:
+	// finalize hash
+	AESENC	X2, X0
+	AESENC	X3, X0
+	AESENC	X2, X0
+	MOVQ	X0, (DX)	// low 64 bits of the state overwrite the seed as the hash result
+	RET
+
+TEXT runtime·aeshash32(SB),7,$0	// fixed-size variant: mixes one 32-bit value; 16(SP) is not read
+	MOVQ	8(SP), DX	// ptr to hash value
+	MOVQ	24(SP), AX	// ptr to data
+	MOVQ	(DX), X0	// seed
+	PINSRD	$2, (AX), X0	// data
+	AESENC	runtime·aeskeysched+0(SB), X0	// three rounds, keyed +0, +16, +0 from aeskeysched
+	AESENC	runtime·aeskeysched+16(SB), X0
+	AESENC	runtime·aeskeysched+0(SB), X0
+	MOVQ	X0, (DX)	// low 64 bits of the state become the hash result
+	RET
+
+TEXT runtime·aeshash64(SB),7,$0	// fixed-size variant: mixes one 64-bit value; 16(SP) is not read
+	MOVQ	8(SP), DX	// ptr to hash value
+	MOVQ	24(SP), AX	// ptr to data
+	MOVQ	(DX), X0	// seed
+	PINSRQ	$1, (AX), X0	// data
+	AESENC	runtime·aeskeysched+0(SB), X0	// three rounds, keyed +0, +16, +0 from aeskeysched
+	AESENC	runtime·aeskeysched+16(SB), X0
+	AESENC	runtime·aeskeysched+0(SB), X0
+	MOVQ	X0, (DX)	// low 64 bits of the state become the hash result
+	RET
+
+// simple mask to get rid of data in the high part of the register.
+TEXT masks(SB),7,$0	// data table emitted as QUAD pairs: 16-byte entry n has its low n bytes set to 0xff
+	QUAD $0x0000000000000000	// entry 0 (not reached: the partial path requires len > 0)
+	QUAD $0x0000000000000000
+	QUAD $0x00000000000000ff	// entry 1
+	QUAD $0x0000000000000000
+	QUAD $0x000000000000ffff	// entry 2
+	QUAD $0x0000000000000000
+	QUAD $0x0000000000ffffff	// entry 3
+	QUAD $0x0000000000000000
+	QUAD $0x00000000ffffffff	// entry 4
+	QUAD $0x0000000000000000
+	QUAD $0x000000ffffffffff	// entry 5
+	QUAD $0x0000000000000000
+	QUAD $0x0000ffffffffffff	// entry 6
+	QUAD $0x0000000000000000
+	QUAD $0x00ffffffffffffff	// entry 7
+	QUAD $0x0000000000000000
+	QUAD $0xffffffffffffffff	// entry 8
+	QUAD $0x0000000000000000
+	QUAD $0xffffffffffffffff	// entry 9
+	QUAD $0x00000000000000ff
+	QUAD $0xffffffffffffffff	// entry 10
+	QUAD $0x000000000000ffff
+	QUAD $0xffffffffffffffff	// entry 11
+	QUAD $0x0000000000ffffff
+	QUAD $0xffffffffffffffff	// entry 12
+	QUAD $0x00000000ffffffff
+	QUAD $0xffffffffffffffff	// entry 13
+	QUAD $0x000000ffffffffff
+	QUAD $0xffffffffffffffff	// entry 14
+	QUAD $0x0000ffffffffffff
+	QUAD $0xffffffffffffffff	// entry 15
+	QUAD $0x00ffffffffffffff
+
+	// these are arguments to pshufb. They move data down from
+	// the high bytes of the register to the low bytes of the register.
+	// index is how many bytes to move.
+TEXT shifts(SB),7,$0	// data table emitted as QUAD pairs: 16-byte entry n selects source bytes 16-n..15; 0xff control bytes produce zero
+	QUAD $0x0000000000000000	// entry 0 (not reached: the partial path requires len > 0)
+	QUAD $0x0000000000000000
+	QUAD $0xffffffffffffff0f	// entry 1
+	QUAD $0xffffffffffffffff
+	QUAD $0xffffffffffff0f0e	// entry 2
+	QUAD $0xffffffffffffffff
+	QUAD $0xffffffffff0f0e0d	// entry 3
+	QUAD $0xffffffffffffffff
+	QUAD $0xffffffff0f0e0d0c	// entry 4
+	QUAD $0xffffffffffffffff
+	QUAD $0xffffff0f0e0d0c0b	// entry 5
+	QUAD $0xffffffffffffffff
+	QUAD $0xffff0f0e0d0c0b0a	// entry 6
+	QUAD $0xffffffffffffffff
+	QUAD $0xff0f0e0d0c0b0a09	// entry 7
+	QUAD $0xffffffffffffffff
+	QUAD $0x0f0e0d0c0b0a0908	// entry 8
+	QUAD $0xffffffffffffffff
+	QUAD $0x0e0d0c0b0a090807	// entry 9
+	QUAD $0xffffffffffffff0f
+	QUAD $0x0d0c0b0a09080706	// entry 10
+	QUAD $0xffffffffffff0f0e
+	QUAD $0x0c0b0a0908070605	// entry 11
+	QUAD $0xffffffffff0f0e0d
+	QUAD $0x0b0a090807060504	// entry 12
+	QUAD $0xffffffff0f0e0d0c
+	QUAD $0x0a09080706050403	// entry 13
+	QUAD $0xffffff0f0e0d0c0b
+	QUAD $0x0908070605040302	// entry 14
+	QUAD $0xffff0f0e0d0c0b0a
+	QUAD $0x0807060504030201	// entry 15
+	QUAD $0xff0f0e0d0c0b0a09
+
+TEXT runtime·memeq(SB),7,$0	// loads a, b, count into SI, DI, BX and jumps to memeqbody, whose result is left in AX
+	MOVQ	a+0(FP), SI
+	MOVQ	b+8(FP), DI
+	MOVQ	count+16(FP), BX
+	JMP	runtime·memeqbody(SB)
+
+
+TEXT bytes·Equal(SB),7,$0	// compares lengths first, then contents via memeqbody; one result byte is stored at ret+48(FP)
+	MOVQ	a_len+8(FP), BX
+	MOVQ	b_len+32(FP), CX
+	XORQ	AX, AX	// result defaults to 0 for the length-mismatch exit
+	CMPQ	BX, CX
+	JNE	eqret	// different lengths: skip the content comparison
+	MOVQ	a+0(FP), SI
+	MOVQ	b+24(FP), DI
+	CALL	runtime·memeqbody(SB)	// CALL rather than JMP: the result still has to be stored at eqret
+eqret:
+	MOVB	AX, ret+48(FP)
+	RET
+
+// a in SI
+// b in DI
+// count in BX
+TEXT runtime·memeqbody(SB),7,$0	// result in AX: 1 if the count bytes match, 0 otherwise
+	XORQ	AX, AX	// AX = 0, the value returned by the early mismatch RETs below
+
+	CMPQ	BX, $8
+	JB	small	// fewer than 8 bytes: handled with a single partial-word compare
+
+	// 64 bytes at a time using xmm registers
+hugeloop:
+	CMPQ	BX, $64
+	JB	bigloop
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	MOVOU	16(SI), X2
+	MOVOU	16(DI), X3
+	MOVOU	32(SI), X4
+	MOVOU	32(DI), X5
+	MOVOU	48(SI), X6
+	MOVOU	48(DI), X7
+	PCMPEQB	X1, X0	// each byte becomes 0xff where a and b agree, 0x00 where they differ
+	PCMPEQB	X3, X2
+	PCMPEQB	X5, X4
+	PCMPEQB	X7, X6
+	PAND	X2, X0	// fold the four compare results together
+	PAND	X6, X4
+	PAND	X4, X0	// a byte of X0 stays 0xff only if that position matched in all four 16-byte chunks
+	PMOVMSKB	X0, DX	// DX = one bit per byte of X0
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, BX
+	CMPL	DX, $0xffff	// all 16 bits set means the whole 64-byte block matched
+	JEQ	hugeloop
+	RET	// mismatch: AX is still 0
+
+	// 8 bytes at a time using 64-bit register
+bigloop:
+	CMPQ	BX, $8
+	JBE	leftover	// 8 or fewer bytes remain: finish with one load ending at the last byte
+	MOVQ	(SI), CX
+	MOVQ	(DI), DX
+	ADDQ	$8, SI
+	ADDQ	$8, DI
+	SUBQ	$8, BX
+	CMPQ	CX, DX
+	JEQ	bigloop
+	RET	// mismatch: AX is still 0
+
+	// remaining 0-8 bytes
+leftover:
+	MOVQ	-8(SI)(BX*1), CX	// the 8 bytes of a that end at its last byte; may overlap bytes already compared
+	MOVQ	-8(DI)(BX*1), DX	// the matching 8 bytes of b
+	CMPQ	CX, DX
+	SETEQ	AX	// low byte of AX = 1 if equal; the upper bits were cleared on entry
+	RET
+
+small:
+	CMPQ	BX, $0
+	JEQ	equal	// count == 0: ZF is set by this compare, so SETEQ at equal yields 1
+
+	LEAQ	0(BX*8), CX	// CX = 8*count (number of bits to compare)
+	NEGQ	CX	// CX = -8*count; as a 64-bit shift count this acts as 64 - 8*count
+
+	CMPB	SI, $0xf8	// compare the low byte of the address with 0xf8
+	JA	si_high
+
+	// load at SI won't cross a page boundary.
+	MOVQ	(SI), SI	// wanted bytes end up in the low count bytes; the bytes above them are unrelated data
+	JMP	si_finish
+si_high:
+	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
+	MOVQ	-8(SI)(BX*1), SI	// 8-byte load that ends at the last wanted byte
+	SHRQ	CX, SI	// shift the wanted bytes down to the low end
+si_finish:
+
+	// same for DI.
+	CMPB	DI, $0xf8
+	JA	di_high
+	MOVQ	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVQ	-8(DI)(BX*1), DI
+	SHRQ	CX, DI
+di_finish:
+
+	SUBQ	SI, DI	// low 8*count bits of the difference are zero exactly when the wanted bytes match
+	SHLQ	CX, DI	// shift out the unrelated high bits; ZF now reflects only the wanted bytes
+equal:
+	SETEQ	AX	// low byte of AX = 1 if ZF is set (bytes equal, or count was 0)
+	RET
|