summaryrefslogtreecommitdiff
path: root/src/pkg/runtime/asm_386.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/pkg/runtime/asm_386.s')
-rw-r--r--src/pkg/runtime/asm_386.s844
1 files changed, 548 insertions, 296 deletions
diff --git a/src/pkg/runtime/asm_386.s b/src/pkg/runtime/asm_386.s
index 531057ff8..5c642c0ed 100644
--- a/src/pkg/runtime/asm_386.s
+++ b/src/pkg/runtime/asm_386.s
@@ -3,8 +3,10 @@
// license that can be found in the LICENSE file.
#include "zasm_GOOS_GOARCH.h"
+#include "funcdata.h"
+#include "../../cmd/ld/textflag.h"
-TEXT _rt0_386(SB),7,$0
+TEXT _rt0_go(SB),NOSPLIT,$0
// copy arguments forward on an even stack
MOVL argc+0(FP), AX
MOVL argv+4(FP), BX
@@ -18,6 +20,7 @@ TEXT _rt0_386(SB),7,$0
MOVL $runtime·g0(SB), BP
LEAL (-64*1024+104)(SP), BX
MOVL BX, g_stackguard(BP)
+ MOVL BX, g_stackguard0(BP)
MOVL SP, g_stackbase(BP)
// find out information about the processor we're on
@@ -41,6 +44,10 @@ nocpuinfo:
MOVL BX, 4(SP)
MOVL BP, 0(SP)
CALL AX
+ // update stackguard after _cgo_init
+ MOVL $runtime·g0(SB), CX
+ MOVL g_stackguard0(CX), AX
+ MOVL AX, g_stackguard(CX)
// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
CMPL runtime·iswindows(SB), $0
JEQ ok
@@ -90,7 +97,9 @@ ok:
// create a new goroutine to start program
PUSHL $runtime·main·f(SB) // entry
PUSHL $0 // arg size
+ ARGSIZE(8)
CALL runtime·newproc(SB)
+ ARGSIZE(-1)
POPL AX
POPL AX
@@ -101,13 +110,13 @@ ok:
RET
DATA runtime·main·f+0(SB)/4,$runtime·main(SB)
-GLOBL runtime·main·f(SB),8,$4
+GLOBL runtime·main·f(SB),RODATA,$4
-TEXT runtime·breakpoint(SB),7,$0
+TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
INT $3
RET
-TEXT runtime·asminit(SB),7,$0
+TEXT runtime·asminit(SB),NOSPLIT,$0-0
// Linux and MinGW start the FPU in extended double precision.
// Other operating systems use double precision.
// Change to double precision to match them,
@@ -123,73 +132,45 @@ TEXT runtime·asminit(SB),7,$0
// void gosave(Gobuf*)
// save state in Gobuf; setjmp
-TEXT runtime·gosave(SB), 7, $0
+TEXT runtime·gosave(SB), NOSPLIT, $0-4
MOVL 4(SP), AX // gobuf
LEAL 4(SP), BX // caller's SP
MOVL BX, gobuf_sp(AX)
MOVL 0(SP), BX // caller's PC
MOVL BX, gobuf_pc(AX)
+ MOVL $0, gobuf_ret(AX)
+ MOVL $0, gobuf_ctxt(AX)
get_tls(CX)
MOVL g(CX), BX
MOVL BX, gobuf_g(AX)
RET
-// void gogo(Gobuf*, uintptr)
+// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
-TEXT runtime·gogo(SB), 7, $0
- MOVL 8(SP), AX // return 2nd arg
+TEXT runtime·gogo(SB), NOSPLIT, $0-4
MOVL 4(SP), BX // gobuf
MOVL gobuf_g(BX), DX
MOVL 0(DX), CX // make sure g != nil
get_tls(CX)
MOVL DX, g(CX)
MOVL gobuf_sp(BX), SP // restore SP
+ MOVL gobuf_ret(BX), AX
+ MOVL gobuf_ctxt(BX), DX
+ MOVL $0, gobuf_sp(BX) // clear to help garbage collector
+ MOVL $0, gobuf_ret(BX)
+ MOVL $0, gobuf_ctxt(BX)
MOVL gobuf_pc(BX), BX
JMP BX
-// void gogocall(Gobuf*, void (*fn)(void), uintptr r0)
-// restore state from Gobuf but then call fn.
-// (call fn, returning to state in Gobuf)
-TEXT runtime·gogocall(SB), 7, $0
- MOVL 12(SP), DX // context
- MOVL 8(SP), AX // fn
- MOVL 4(SP), BX // gobuf
- MOVL gobuf_g(BX), DI
- get_tls(CX)
- MOVL DI, g(CX)
- MOVL 0(DI), CX // make sure g != nil
- MOVL gobuf_sp(BX), SP // restore SP
- MOVL gobuf_pc(BX), BX
- PUSHL BX
- JMP AX
- POPL BX // not reached
-
-// void gogocallfn(Gobuf*, FuncVal*)
-// restore state from Gobuf but then call fn.
-// (call fn, returning to state in Gobuf)
-TEXT runtime·gogocallfn(SB), 7, $0
- MOVL 8(SP), DX // fn
- MOVL 4(SP), BX // gobuf
- MOVL gobuf_g(BX), DI
- get_tls(CX)
- MOVL DI, g(CX)
- MOVL 0(DI), CX // make sure g != nil
- MOVL gobuf_sp(BX), SP // restore SP
- MOVL gobuf_pc(BX), BX
- PUSHL BX
- MOVL 0(DX), BX
- JMP BX
- POPL BX // not reached
-
// void mcall(void (*fn)(G*))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
-TEXT runtime·mcall(SB), 7, $0
+TEXT runtime·mcall(SB), NOSPLIT, $0-4
MOVL fn+0(FP), DI
get_tls(CX)
- MOVL g(CX), AX // save state in g->gobuf
+ MOVL g(CX), AX // save state in g->sched
MOVL 0(SP), BX // caller's PC
MOVL BX, (g_sched+gobuf_pc)(AX)
LEAL 4(SP), BX // caller's SP
@@ -200,14 +181,16 @@ TEXT runtime·mcall(SB), 7, $0
MOVL m(CX), BX
MOVL m_g0(BX), SI
CMPL SI, AX // if g == m->g0 call badmcall
- JNE 2(PC)
- CALL runtime·badmcall(SB)
+ JNE 3(PC)
+ MOVL $runtime·badmcall(SB), AX
+ JMP AX
MOVL SI, g(CX) // g = m->g0
- MOVL (g_sched+gobuf_sp)(SI), SP // sp = m->g0->gobuf.sp
+ MOVL (g_sched+gobuf_sp)(SI), SP // sp = m->g0->sched.sp
PUSHL AX
CALL DI
POPL AX
- CALL runtime·badmcall2(SB)
+ MOVL $runtime·badmcall2(SB), AX
+ JMP AX
RET
/*
@@ -215,7 +198,12 @@ TEXT runtime·mcall(SB), 7, $0
*/
// Called during function prolog when more stack is needed.
-TEXT runtime·morestack(SB),7,$0
+//
+// The traceback routines see morestack on a g0 as being
+// the top of a stack (for example, morestack calling newstack
+// calling the scheduler calling newm calling gc), so we must
+// record an argument size. For that purpose, it has no arguments.
+TEXT runtime·morestack(SB),NOSPLIT,$0-0
// Cannot grow scheduler stack (m->g0).
get_tls(CX)
MOVL m(CX), BX
@@ -223,8 +211,6 @@ TEXT runtime·morestack(SB),7,$0
CMPL g(CX), SI
JNE 2(PC)
INT $3
-
- MOVL DX, m_cret(BX)
// frame size in DI
// arg size in AX
@@ -243,9 +229,13 @@ TEXT runtime·morestack(SB),7,$0
MOVL g(CX), SI
MOVL SI, (m_morebuf+gobuf_g)(BX)
- // Set m->morepc to f's PC.
- MOVL 0(SP), AX
- MOVL AX, m_morepc(BX)
+ // Set g->sched to context in f.
+ MOVL 0(SP), AX // f's PC
+ MOVL AX, (g_sched+gobuf_pc)(SI)
+ MOVL SI, (g_sched+gobuf_g)(SI)
+ LEAL 4(SP), AX // f's SP
+ MOVL AX, (g_sched+gobuf_sp)(SI)
+ MOVL DX, (g_sched+gobuf_ctxt)(SI)
// Call newstack on m->g0's stack.
MOVL m_g0(BX), BP
@@ -257,12 +247,12 @@ TEXT runtime·morestack(SB),7,$0
MOVL $0, 0x1003 // crash if newstack returns
RET
-// Called from reflection library. Mimics morestack,
+// Called from panic. Mimics morestack,
// reuses stack growth code to create a frame
// with the desired args running the desired function.
//
// func call(fn *byte, arg *byte, argsize uint32).
-TEXT reflect·call(SB), 7, $0
+TEXT runtime·newstackcall(SB), NOSPLIT, $0-12
get_tls(CX)
MOVL m(CX), BX
@@ -275,9 +265,14 @@ TEXT reflect·call(SB), 7, $0
MOVL g(CX), AX
MOVL AX, (m_morebuf+gobuf_g)(BX)
+ // Save our own state as the PC and SP to restore
+ // if this goroutine needs to be restarted.
+ MOVL $runtime·newstackcall(SB), (g_sched+gobuf_pc)(AX)
+ MOVL SP, (g_sched+gobuf_sp)(AX)
+
// Set up morestack arguments to call f on a new stack.
// We set f's frame size to 1, as a hint to newstack
- // that this is a call from reflect·call.
+ // that this is a call from runtime·newstackcall.
// If it turns out that f needs a larger frame than
// the default stack, f's usual stack growth prolog will
// allocate a new segment (and recopy the arguments).
@@ -285,7 +280,7 @@ TEXT reflect·call(SB), 7, $0
MOVL 8(SP), DX // arg frame
MOVL 12(SP), CX // arg size
- MOVL AX, m_morepc(BX) // f's PC
+ MOVL AX, m_cret(BX) // f's PC
MOVL DX, m_moreargp(BX) // f's argument pointer
MOVL CX, m_moreargsize(BX) // f's argument size
MOVL $1, m_moreframesize(BX) // f's frame size
@@ -299,9 +294,101 @@ TEXT reflect·call(SB), 7, $0
MOVL $0, 0x1103 // crash if newstack returns
RET
+// reflect·call: call a function with the given argument list
+// func call(f *FuncVal, arg *byte, argsize uint32).
+// we don't have variable-sized frames, so we use a small number
+// of constant-sized-frame functions to encode a few bits of size in the pc.
+// Caution: ugly multiline assembly macros in your future!
+
+#define DISPATCH(NAME,MAXSIZE) \
+ CMPL CX, $MAXSIZE; \
+ JA 3(PC); \
+ MOVL $runtime·NAME(SB), AX; \
+ JMP AX
+// Note: can't just "JMP runtime·NAME(SB)" - bad inlining results.
+
+TEXT reflect·call(SB), NOSPLIT, $0-12
+ MOVL argsize+8(FP), CX
+ DISPATCH(call16, 16)
+ DISPATCH(call32, 32)
+ DISPATCH(call64, 64)
+ DISPATCH(call128, 128)
+ DISPATCH(call256, 256)
+ DISPATCH(call512, 512)
+ DISPATCH(call1024, 1024)
+ DISPATCH(call2048, 2048)
+ DISPATCH(call4096, 4096)
+ DISPATCH(call8192, 8192)
+ DISPATCH(call16384, 16384)
+ DISPATCH(call32768, 32768)
+ DISPATCH(call65536, 65536)
+ DISPATCH(call131072, 131072)
+ DISPATCH(call262144, 262144)
+ DISPATCH(call524288, 524288)
+ DISPATCH(call1048576, 1048576)
+ DISPATCH(call2097152, 2097152)
+ DISPATCH(call4194304, 4194304)
+ DISPATCH(call8388608, 8388608)
+ DISPATCH(call16777216, 16777216)
+ DISPATCH(call33554432, 33554432)
+ DISPATCH(call67108864, 67108864)
+ DISPATCH(call134217728, 134217728)
+ DISPATCH(call268435456, 268435456)
+ DISPATCH(call536870912, 536870912)
+ DISPATCH(call1073741824, 1073741824)
+ MOVL $runtime·badreflectcall(SB), AX
+ JMP AX
+
+#define CALLFN(NAME,MAXSIZE) \
+TEXT runtime·NAME(SB), WRAPPER, $MAXSIZE-12; \
+ /* copy arguments to stack */ \
+ MOVL argptr+4(FP), SI; \
+ MOVL argsize+8(FP), CX; \
+ MOVL SP, DI; \
+ REP;MOVSB; \
+ /* call function */ \
+ MOVL f+0(FP), DX; \
+ CALL (DX); \
+ /* copy return values back */ \
+ MOVL argptr+4(FP), DI; \
+ MOVL argsize+8(FP), CX; \
+ MOVL SP, SI; \
+ REP;MOVSB; \
+ RET
+
+CALLFN(call16, 16)
+CALLFN(call32, 32)
+CALLFN(call64, 64)
+CALLFN(call128, 128)
+CALLFN(call256, 256)
+CALLFN(call512, 512)
+CALLFN(call1024, 1024)
+CALLFN(call2048, 2048)
+CALLFN(call4096, 4096)
+CALLFN(call8192, 8192)
+CALLFN(call16384, 16384)
+CALLFN(call32768, 32768)
+CALLFN(call65536, 65536)
+CALLFN(call131072, 131072)
+CALLFN(call262144, 262144)
+CALLFN(call524288, 524288)
+CALLFN(call1048576, 1048576)
+CALLFN(call2097152, 2097152)
+CALLFN(call4194304, 4194304)
+CALLFN(call8388608, 8388608)
+CALLFN(call16777216, 16777216)
+CALLFN(call33554432, 33554432)
+CALLFN(call67108864, 67108864)
+CALLFN(call134217728, 134217728)
+CALLFN(call268435456, 268435456)
+CALLFN(call536870912, 536870912)
+CALLFN(call1073741824, 1073741824)
// Return point when leaving stack.
-TEXT runtime·lessstack(SB), 7, $0
+//
+// Lessstack can appear in stack traces for the same reason
+// as morestack; in that context, it has 0 arguments.
+TEXT runtime·lessstack(SB), NOSPLIT, $0-0
// Save return value in m->cret
get_tls(CX)
MOVL m(CX), BX
@@ -323,7 +410,7 @@ TEXT runtime·lessstack(SB), 7, $0
// return 1;
// }else
// return 0;
-TEXT runtime·cas(SB), 7, $0
+TEXT runtime·cas(SB), NOSPLIT, $0-12
MOVL 4(SP), BX
MOVL 8(SP), AX
MOVL 12(SP), CX
@@ -335,30 +422,26 @@ TEXT runtime·cas(SB), 7, $0
MOVL $1, AX
RET
-// bool runtime·cas64(uint64 *val, uint64 *old, uint64 new)
+// bool runtime·cas64(uint64 *val, uint64 old, uint64 new)
// Atomically:
// if(*val == *old){
// *val = new;
// return 1;
// } else {
-// *old = *val
// return 0;
// }
-TEXT runtime·cas64(SB), 7, $0
+TEXT runtime·cas64(SB), NOSPLIT, $0-20
MOVL 4(SP), BP
- MOVL 8(SP), SI
- MOVL 0(SI), AX
- MOVL 4(SI), DX
- MOVL 12(SP), BX
- MOVL 16(SP), CX
+ MOVL 8(SP), AX
+ MOVL 12(SP), DX
+ MOVL 16(SP), BX
+ MOVL 20(SP), CX
LOCK
CMPXCHG8B 0(BP)
JNZ cas64_fail
MOVL $1, AX
RET
cas64_fail:
- MOVL AX, 0(SI)
- MOVL DX, 4(SI)
MOVL $0, AX
RET
@@ -369,7 +452,7 @@ cas64_fail:
// return 1;
// }else
// return 0;
-TEXT runtime·casp(SB), 7, $0
+TEXT runtime·casp(SB), NOSPLIT, $0-12
MOVL 4(SP), BX
MOVL 8(SP), AX
MOVL 12(SP), CX
@@ -385,7 +468,7 @@ TEXT runtime·casp(SB), 7, $0
// Atomically:
// *val += delta;
// return *val;
-TEXT runtime·xadd(SB), 7, $0
+TEXT runtime·xadd(SB), NOSPLIT, $0-8
MOVL 4(SP), BX
MOVL 8(SP), AX
MOVL AX, CX
@@ -394,13 +477,13 @@ TEXT runtime·xadd(SB), 7, $0
ADDL CX, AX
RET
-TEXT runtime·xchg(SB), 7, $0
+TEXT runtime·xchg(SB), NOSPLIT, $0-8
MOVL 4(SP), BX
MOVL 8(SP), AX
XCHGL AX, 0(BX)
RET
-TEXT runtime·procyield(SB),7,$0
+TEXT runtime·procyield(SB),NOSPLIT,$0-0
MOVL 4(SP), AX
again:
PAUSE
@@ -408,13 +491,13 @@ again:
JNZ again
RET
-TEXT runtime·atomicstorep(SB), 7, $0
+TEXT runtime·atomicstorep(SB), NOSPLIT, $0-8
MOVL 4(SP), BX
MOVL 8(SP), AX
XCHGL AX, 0(BX)
RET
-TEXT runtime·atomicstore(SB), 7, $0
+TEXT runtime·atomicstore(SB), NOSPLIT, $0-8
MOVL 4(SP), BX
MOVL 8(SP), AX
XCHGL AX, 0(BX)
@@ -423,8 +506,8 @@ TEXT runtime·atomicstore(SB), 7, $0
// uint64 atomicload64(uint64 volatile* addr);
// so actually
// void atomicload64(uint64 *res, uint64 volatile *addr);
-TEXT runtime·atomicload64(SB), 7, $0
- MOVL 4(SP), BX
+TEXT runtime·atomicload64(SB), NOSPLIT, $0-8
+ MOVL 4(SP), BX
MOVL 8(SP), AX
// MOVQ (%EAX), %MM0
BYTE $0x0f; BYTE $0x6f; BYTE $0x00
@@ -435,7 +518,7 @@ TEXT runtime·atomicload64(SB), 7, $0
RET
// void runtime·atomicstore64(uint64 volatile* addr, uint64 v);
-TEXT runtime·atomicstore64(SB), 7, $0
+TEXT runtime·atomicstore64(SB), NOSPLIT, $0-12
MOVL 4(SP), AX
// MOVQ and EMMS were introduced on the Pentium MMX.
// MOVQ 0x8(%ESP), %MM0
@@ -456,7 +539,7 @@ TEXT runtime·atomicstore64(SB), 7, $0
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3. jmp to the argument
-TEXT runtime·jmpdefer(SB), 7, $0
+TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
MOVL 4(SP), DX // fn
MOVL 8(SP), BX // caller sp
LEAL -4(BX), SP // caller sp after CALL
@@ -464,18 +547,27 @@ TEXT runtime·jmpdefer(SB), 7, $0
MOVL 0(DX), BX
JMP BX // but first run the deferred function
-// Dummy function to use in saved gobuf.PC,
-// to match SP pointing at a return address.
-// The gobuf.PC is unused by the contortions here
-// but setting it to return will make the traceback code work.
-TEXT return<>(SB),7,$0
+// Save state of caller into g->sched.
+TEXT gosave<>(SB),NOSPLIT,$0
+ PUSHL AX
+ PUSHL BX
+ get_tls(BX)
+ MOVL g(BX), BX
+ LEAL arg+0(FP), AX
+ MOVL AX, (g_sched+gobuf_sp)(BX)
+ MOVL -4(AX), AX
+ MOVL AX, (g_sched+gobuf_pc)(BX)
+ MOVL $0, (g_sched+gobuf_ret)(BX)
+ MOVL $0, (g_sched+gobuf_ctxt)(BX)
+ POPL BX
+ POPL AX
RET
// asmcgocall(void(*fn)(void*), void *arg)
// Call fn(arg) on the scheduler stack,
// aligned appropriately for the gcc ABI.
// See cgocall.c for more details.
-TEXT runtime·asmcgocall(SB),7,$0
+TEXT runtime·asmcgocall(SB),NOSPLIT,$0-8
MOVL fn+0(FP), AX
MOVL arg+4(FP), BX
MOVL SP, DX
@@ -488,10 +580,8 @@ TEXT runtime·asmcgocall(SB),7,$0
MOVL m_g0(BP), SI
MOVL g(CX), DI
CMPL SI, DI
- JEQ 6(PC)
- MOVL SP, (g_sched+gobuf_sp)(DI)
- MOVL $return<>(SB), (g_sched+gobuf_pc)(DI)
- MOVL DI, (g_sched+gobuf_g)(DI)
+ JEQ 4(PC)
+ CALL gosave<>(SB)
MOVL SI, g(CX)
MOVL (g_sched+gobuf_sp)(SI), SP
@@ -513,7 +603,7 @@ TEXT runtime·asmcgocall(SB),7,$0
// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
// Turn the fn into a Go func (by taking its address) and call
// cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),7,$12
+TEXT runtime·cgocallback(SB),NOSPLIT,$12-12
LEAL fn+0(FP), AX
MOVL AX, 0(SP)
MOVL frame+4(FP), AX
@@ -526,7 +616,7 @@ TEXT runtime·cgocallback(SB),7,$12
// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
// See cgocall.c for more details.
-TEXT runtime·cgocallback_gofunc(SB),7,$12
+TEXT runtime·cgocallback_gofunc(SB),NOSPLIT,$12-12
// If m is nil, Go did not create the current thread.
// Call needm to obtain one for temporary use.
// In this case, we're running on the thread stack, so there's
@@ -534,18 +624,19 @@ TEXT runtime·cgocallback_gofunc(SB),7,$12
// the linker analysis by using an indirect call through AX.
get_tls(CX)
#ifdef GOOS_windows
+ MOVL $0, BP
CMPL CX, $0
- JNE 3(PC)
- PUSHL $0
- JMP needm
+ JEQ 2(PC)
#endif
MOVL m(CX), BP
- PUSHL BP
+ MOVL BP, DX // saved copy of oldm
CMPL BP, $0
JNE havem
needm:
+ MOVL DX, 0(SP)
MOVL $runtime·needm(SB), AX
CALL AX
+ MOVL 0(SP), DX
get_tls(CX)
MOVL m(CX), BP
@@ -554,50 +645,40 @@ havem:
// Save current m->g0->sched.sp on stack and then set it to SP.
// Save current sp in m->g0->sched.sp in preparation for
// switch back to m->curg stack.
+ // NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
+ // On Windows, the SEH is at 4(SP) and 8(SP).
MOVL m_g0(BP), SI
- PUSHL (g_sched+gobuf_sp)(SI)
+ MOVL (g_sched+gobuf_sp)(SI), AX
+ MOVL AX, 0(SP)
MOVL SP, (g_sched+gobuf_sp)(SI)
- // Switch to m->curg stack and call runtime.cgocallbackg
- // with the three arguments. Because we are taking over
- // the execution of m->curg but *not* resuming what had
- // been running, we need to save that information (m->curg->gobuf)
- // so that we can restore it when we're done.
- // We can restore m->curg->gobuf.sp easily, because calling
+ // Switch to m->curg stack and call runtime.cgocallbackg.
+ // Because we are taking over the execution of m->curg
+ // but *not* resuming what had been running, we need to
+ // save that information (m->curg->sched) so we can restore it.
+ // We can restore m->curg->sched.sp easily, because calling
// runtime.cgocallbackg leaves SP unchanged upon return.
- // To save m->curg->gobuf.pc, we push it onto the stack.
+ // To save m->curg->sched.pc, we push it onto the stack.
// This has the added benefit that it looks to the traceback
// routine like cgocallbackg is going to return to that
- // PC (because we defined cgocallbackg to have
- // a frame size of 12, the same amount that we use below),
+ // PC (because the frame we allocate below has the same
+ // size as cgocallback_gofunc's frame declared above)
// so that the traceback will seamlessly trace back into
// the earlier calls.
- MOVL fn+0(FP), AX
- MOVL frame+4(FP), BX
- MOVL framesize+8(FP), DX
-
+ //
+ // In the new goroutine, 0(SP) holds the saved oldm (DX) register.
+ // 4(SP) and 8(SP) are unused.
MOVL m_curg(BP), SI
MOVL SI, g(CX)
- MOVL (g_sched+gobuf_sp)(SI), DI // prepare stack as DI
-
- // Push gobuf.pc
+ MOVL (g_sched+gobuf_sp)(SI), DI // prepare stack as DI
MOVL (g_sched+gobuf_pc)(SI), BP
- SUBL $4, DI
- MOVL BP, 0(DI)
-
- // Push arguments to cgocallbackg.
- // Frame size here must match the frame size above
- // to trick traceback routines into doing the right thing.
- SUBL $12, DI
- MOVL AX, 0(DI)
- MOVL BX, 4(DI)
- MOVL DX, 8(DI)
-
- // Switch stack and make the call.
- MOVL DI, SP
+ MOVL BP, -4(DI)
+ LEAL -(4+12)(DI), SP
+ MOVL DX, 0(SP)
CALL runtime·cgocallbackg(SB)
+ MOVL 0(SP), DX
- // Restore g->gobuf (== m->curg->gobuf) from saved values.
+ // Restore g->sched (== m->curg->sched) from saved values.
get_tls(CX)
MOVL g(CX), SI
MOVL 12(SP), BP
@@ -612,12 +693,12 @@ havem:
MOVL m_g0(BP), SI
MOVL SI, g(CX)
MOVL (g_sched+gobuf_sp)(SI), SP
- POPL (g_sched+gobuf_sp)(SI)
+ MOVL 0(SP), AX
+ MOVL AX, (g_sched+gobuf_sp)(SI)
// If the m on entry was nil, we called needm above to borrow an m
// for the duration of the call. Since the call is over, return it with dropm.
- POPL BP
- CMPL BP, $0
+ CMPL DX, $0
JNE 3(PC)
MOVL $runtime·dropm(SB), AX
CALL AX
@@ -626,7 +707,7 @@ havem:
RET
// void setmg(M*, G*); set m and g. for use by needm.
-TEXT runtime·setmg(SB), 7, $0
+TEXT runtime·setmg(SB), NOSPLIT, $0-8
#ifdef GOOS_windows
MOVL mm+0(FP), AX
CMPL AX, $0
@@ -646,7 +727,7 @@ settls:
RET
// void setmg_gcc(M*, G*); set m and g. for use by gcc
-TEXT setmg_gcc<>(SB), 7, $0
+TEXT setmg_gcc<>(SB), NOSPLIT, $0
get_tls(AX)
MOVL mm+0(FP), DX
MOVL DX, m(AX)
@@ -655,7 +736,7 @@ TEXT setmg_gcc<>(SB), 7, $0
RET
// check that SP is in range [g->stackbase, g->stackguard)
-TEXT runtime·stackcheck(SB), 7, $0
+TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
get_tls(CX)
MOVL g(CX), AX
CMPL g_stackbase(AX), SP
@@ -666,7 +747,7 @@ TEXT runtime·stackcheck(SB), 7, $0
INT $3
RET
-TEXT runtime·memclr(SB),7,$0
+TEXT runtime·memclr(SB),NOSPLIT,$0-8
MOVL 4(SP), DI // arg 1 addr
MOVL 8(SP), CX // arg 2 count
MOVL CX, BX
@@ -681,31 +762,31 @@ TEXT runtime·memclr(SB),7,$0
STOSB
RET
-TEXT runtime·getcallerpc(SB),7,$0
+TEXT runtime·getcallerpc(SB),NOSPLIT,$0-4
MOVL x+0(FP),AX // addr of first arg
MOVL -4(AX),AX // get calling pc
RET
-TEXT runtime·setcallerpc(SB),7,$0
+TEXT runtime·setcallerpc(SB),NOSPLIT,$0-8
MOVL x+0(FP),AX // addr of first arg
MOVL x+4(FP), BX
MOVL BX, -4(AX) // set calling pc
RET
-TEXT runtime·getcallersp(SB), 7, $0
+TEXT runtime·getcallersp(SB), NOSPLIT, $0-4
MOVL sp+0(FP), AX
RET
// int64 runtime·cputicks(void), so really
// void runtime·cputicks(int64 *ticks)
-TEXT runtime·cputicks(SB),7,$0
+TEXT runtime·cputicks(SB),NOSPLIT,$0-4
RDTSC
MOVL ret+0(FP), DI
MOVL AX, 0(DI)
MOVL DX, 4(DI)
RET
-TEXT runtime·ldt0setup(SB),7,$16
+TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
// set up ldt 7 to point at tls0
// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
// the entry number is just a hint. setldt will set up GS with what it used.
@@ -716,13 +797,13 @@ TEXT runtime·ldt0setup(SB),7,$16
CALL runtime·setldt(SB)
RET
-TEXT runtime·emptyfunc(SB),0,$0
+TEXT runtime·emptyfunc(SB),0,$0-0
RET
-TEXT runtime·abort(SB),7,$0
+TEXT runtime·abort(SB),NOSPLIT,$0-0
INT $0x3
-TEXT runtime·stackguard(SB),7,$0
+TEXT runtime·stackguard(SB),NOSPLIT,$0-8
MOVL SP, DX
MOVL DX, sp+0(FP)
get_tls(CX)
@@ -734,13 +815,13 @@ TEXT runtime·stackguard(SB),7,$0
GLOBL runtime·tls0(SB), $32
// hash function using AES hardware instructions
-TEXT runtime·aeshash(SB),7,$0
+TEXT runtime·aeshash(SB),NOSPLIT,$0-12
MOVL 4(SP), DX // ptr to hash value
MOVL 8(SP), CX // size
MOVL 12(SP), AX // ptr to data
JMP runtime·aeshashbody(SB)
-TEXT runtime·aeshashstr(SB),7,$0
+TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
MOVL 4(SP), DX // ptr to hash value
MOVL 12(SP), AX // ptr to string struct
MOVL 4(AX), CX // length of string
@@ -750,41 +831,49 @@ TEXT runtime·aeshashstr(SB),7,$0
// AX: data
// CX: length
// DX: ptr to seed input / hash output
-TEXT runtime·aeshashbody(SB),7,$0
+TEXT runtime·aeshashbody(SB),NOSPLIT,$0-12
MOVL (DX), X0 // seed to low 32 bits of xmm0
PINSRD $1, CX, X0 // size to next 32 bits of xmm0
MOVO runtime·aeskeysched+0(SB), X2
MOVO runtime·aeskeysched+16(SB), X3
+ CMPL CX, $16
+ JB aessmall
aesloop:
CMPL CX, $16
- JB aesloopend
+ JBE aesloopend
MOVOU (AX), X1
AESENC X2, X0
AESENC X1, X0
SUBL $16, CX
ADDL $16, AX
JMP aesloop
+// 1-16 bytes remaining
aesloopend:
+ // This load may overlap with the previous load above.
+ // We'll hash some bytes twice, but that's ok.
+ MOVOU -16(AX)(CX*1), X1
+ JMP partial
+// 0-15 bytes
+aessmall:
TESTL CX, CX
- JE finalize // no partial block
+ JE finalize // 0 bytes
- TESTL $16, AX
- JNE highpartial
+ CMPB AX, $0xf0
+ JA highpartial
- // address ends in 0xxxx. 16 bytes loaded
- // at this address won't cross a page boundary, so
- // we can load it directly.
+ // 16 bytes loaded at this address won't cross
+ // a page boundary, so we can load it directly.
MOVOU (AX), X1
ADDL CX, CX
- PAND masks(SB)(CX*8), X1
+ PAND masks<>(SB)(CX*8), X1
JMP partial
highpartial:
- // address ends in 1xxxx. Might be up against
+ // address ends in 1111xxxx. Might be up against
// a page boundary, so load ending at last byte.
// Then shift bytes down using pshufb.
MOVOU -16(AX)(CX*1), X1
ADDL CX, CX
- PSHUFB shifts(SB)(CX*8), X1
+ PSHUFB shifts<>(SB)(CX*8), X1
partial:
// incorporate partial block into hash
AESENC X3, X0
@@ -797,7 +886,7 @@ finalize:
MOVL X0, (DX)
RET
-TEXT runtime·aeshash32(SB),7,$0
+TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
MOVL 4(SP), DX // ptr to hash value
MOVL 12(SP), AX // ptr to data
MOVL (DX), X0 // seed
@@ -808,7 +897,7 @@ TEXT runtime·aeshash32(SB),7,$0
MOVL X0, (DX)
RET
-TEXT runtime·aeshash64(SB),7,$0
+TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
MOVL 4(SP), DX // ptr to hash value
MOVL 12(SP), AX // ptr to data
MOVQ (AX), X0 // data
@@ -819,181 +908,181 @@ TEXT runtime·aeshash64(SB),7,$0
MOVL X0, (DX)
RET
-
// simple mask to get rid of data in the high part of the register.
-TEXT masks(SB),7,$0
- LONG $0x00000000
- LONG $0x00000000
- LONG $0x00000000
- LONG $0x00000000
+DATA masks<>+0x00(SB)/4, $0x00000000
+DATA masks<>+0x04(SB)/4, $0x00000000
+DATA masks<>+0x08(SB)/4, $0x00000000
+DATA masks<>+0x0c(SB)/4, $0x00000000
- LONG $0x000000ff
- LONG $0x00000000
- LONG $0x00000000
- LONG $0x00000000
+DATA masks<>+0x10(SB)/4, $0x000000ff
+DATA masks<>+0x14(SB)/4, $0x00000000
+DATA masks<>+0x18(SB)/4, $0x00000000
+DATA masks<>+0x1c(SB)/4, $0x00000000
- LONG $0x0000ffff
- LONG $0x00000000
- LONG $0x00000000
- LONG $0x00000000
+DATA masks<>+0x20(SB)/4, $0x0000ffff
+DATA masks<>+0x24(SB)/4, $0x00000000
+DATA masks<>+0x28(SB)/4, $0x00000000
+DATA masks<>+0x2c(SB)/4, $0x00000000
- LONG $0x00ffffff
- LONG $0x00000000
- LONG $0x00000000
- LONG $0x00000000
+DATA masks<>+0x30(SB)/4, $0x00ffffff
+DATA masks<>+0x34(SB)/4, $0x00000000
+DATA masks<>+0x38(SB)/4, $0x00000000
+DATA masks<>+0x3c(SB)/4, $0x00000000
- LONG $0xffffffff
- LONG $0x00000000
- LONG $0x00000000
- LONG $0x00000000
+DATA masks<>+0x40(SB)/4, $0xffffffff
+DATA masks<>+0x44(SB)/4, $0x00000000
+DATA masks<>+0x48(SB)/4, $0x00000000
+DATA masks<>+0x4c(SB)/4, $0x00000000
- LONG $0xffffffff
- LONG $0x000000ff
- LONG $0x00000000
- LONG $0x00000000
+DATA masks<>+0x50(SB)/4, $0xffffffff
+DATA masks<>+0x54(SB)/4, $0x000000ff
+DATA masks<>+0x58(SB)/4, $0x00000000
+DATA masks<>+0x5c(SB)/4, $0x00000000
- LONG $0xffffffff
- LONG $0x0000ffff
- LONG $0x00000000
- LONG $0x00000000
+DATA masks<>+0x60(SB)/4, $0xffffffff
+DATA masks<>+0x64(SB)/4, $0x0000ffff
+DATA masks<>+0x68(SB)/4, $0x00000000
+DATA masks<>+0x6c(SB)/4, $0x00000000
- LONG $0xffffffff
- LONG $0x00ffffff
- LONG $0x00000000
- LONG $0x00000000
+DATA masks<>+0x70(SB)/4, $0xffffffff
+DATA masks<>+0x74(SB)/4, $0x00ffffff
+DATA masks<>+0x78(SB)/4, $0x00000000
+DATA masks<>+0x7c(SB)/4, $0x00000000
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0x00000000
- LONG $0x00000000
+DATA masks<>+0x80(SB)/4, $0xffffffff
+DATA masks<>+0x84(SB)/4, $0xffffffff
+DATA masks<>+0x88(SB)/4, $0x00000000
+DATA masks<>+0x8c(SB)/4, $0x00000000
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0x000000ff
- LONG $0x00000000
+DATA masks<>+0x90(SB)/4, $0xffffffff
+DATA masks<>+0x94(SB)/4, $0xffffffff
+DATA masks<>+0x98(SB)/4, $0x000000ff
+DATA masks<>+0x9c(SB)/4, $0x00000000
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0x0000ffff
- LONG $0x00000000
+DATA masks<>+0xa0(SB)/4, $0xffffffff
+DATA masks<>+0xa4(SB)/4, $0xffffffff
+DATA masks<>+0xa8(SB)/4, $0x0000ffff
+DATA masks<>+0xac(SB)/4, $0x00000000
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0x00ffffff
- LONG $0x00000000
+DATA masks<>+0xb0(SB)/4, $0xffffffff
+DATA masks<>+0xb4(SB)/4, $0xffffffff
+DATA masks<>+0xb8(SB)/4, $0x00ffffff
+DATA masks<>+0xbc(SB)/4, $0x00000000
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0x00000000
+DATA masks<>+0xc0(SB)/4, $0xffffffff
+DATA masks<>+0xc4(SB)/4, $0xffffffff
+DATA masks<>+0xc8(SB)/4, $0xffffffff
+DATA masks<>+0xcc(SB)/4, $0x00000000
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0x000000ff
+DATA masks<>+0xd0(SB)/4, $0xffffffff
+DATA masks<>+0xd4(SB)/4, $0xffffffff
+DATA masks<>+0xd8(SB)/4, $0xffffffff
+DATA masks<>+0xdc(SB)/4, $0x000000ff
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0x0000ffff
+DATA masks<>+0xe0(SB)/4, $0xffffffff
+DATA masks<>+0xe4(SB)/4, $0xffffffff
+DATA masks<>+0xe8(SB)/4, $0xffffffff
+DATA masks<>+0xec(SB)/4, $0x0000ffff
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0x00ffffff
-
- // these are arguments to pshufb. They move data down from
- // the high bytes of the register to the low bytes of the register.
- // index is how many bytes to move.
-TEXT shifts(SB),7,$0
- LONG $0x00000000
- LONG $0x00000000
- LONG $0x00000000
- LONG $0x00000000
+DATA masks<>+0xf0(SB)/4, $0xffffffff
+DATA masks<>+0xf4(SB)/4, $0xffffffff
+DATA masks<>+0xf8(SB)/4, $0xffffffff
+DATA masks<>+0xfc(SB)/4, $0x00ffffff
+
+GLOBL masks<>(SB),RODATA,$256
+
+// these are arguments to pshufb. They move data down from
+// the high bytes of the register to the low bytes of the register.
+// index is how many bytes to move.
+DATA shifts<>+0x00(SB)/4, $0x00000000
+DATA shifts<>+0x04(SB)/4, $0x00000000
+DATA shifts<>+0x08(SB)/4, $0x00000000
+DATA shifts<>+0x0c(SB)/4, $0x00000000
- LONG $0xffffff0f
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0xffffffff
+DATA shifts<>+0x10(SB)/4, $0xffffff0f
+DATA shifts<>+0x14(SB)/4, $0xffffffff
+DATA shifts<>+0x18(SB)/4, $0xffffffff
+DATA shifts<>+0x1c(SB)/4, $0xffffffff
- LONG $0xffff0f0e
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0xffffffff
+DATA shifts<>+0x20(SB)/4, $0xffff0f0e
+DATA shifts<>+0x24(SB)/4, $0xffffffff
+DATA shifts<>+0x28(SB)/4, $0xffffffff
+DATA shifts<>+0x2c(SB)/4, $0xffffffff
- LONG $0xff0f0e0d
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0xffffffff
+DATA shifts<>+0x30(SB)/4, $0xff0f0e0d
+DATA shifts<>+0x34(SB)/4, $0xffffffff
+DATA shifts<>+0x38(SB)/4, $0xffffffff
+DATA shifts<>+0x3c(SB)/4, $0xffffffff
- LONG $0x0f0e0d0c
- LONG $0xffffffff
- LONG $0xffffffff
- LONG $0xffffffff
+DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c
+DATA shifts<>+0x44(SB)/4, $0xffffffff
+DATA shifts<>+0x48(SB)/4, $0xffffffff
+DATA shifts<>+0x4c(SB)/4, $0xffffffff
- LONG $0x0e0d0c0b
- LONG $0xffffff0f
- LONG $0xffffffff
- LONG $0xffffffff
+DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b
+DATA shifts<>+0x54(SB)/4, $0xffffff0f
+DATA shifts<>+0x58(SB)/4, $0xffffffff
+DATA shifts<>+0x5c(SB)/4, $0xffffffff
- LONG $0x0d0c0b0a
- LONG $0xffff0f0e
- LONG $0xffffffff
- LONG $0xffffffff
+DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a
+DATA shifts<>+0x64(SB)/4, $0xffff0f0e
+DATA shifts<>+0x68(SB)/4, $0xffffffff
+DATA shifts<>+0x6c(SB)/4, $0xffffffff
- LONG $0x0c0b0a09
- LONG $0xff0f0e0d
- LONG $0xffffffff
- LONG $0xffffffff
+DATA shifts<>+0x70(SB)/4, $0x0c0b0a09
+DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
+DATA shifts<>+0x78(SB)/4, $0xffffffff
+DATA shifts<>+0x7c(SB)/4, $0xffffffff
- LONG $0x0b0a0908
- LONG $0x0f0e0d0c
- LONG $0xffffffff
- LONG $0xffffffff
+DATA shifts<>+0x80(SB)/4, $0x0b0a0908
+DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
+DATA shifts<>+0x88(SB)/4, $0xffffffff
+DATA shifts<>+0x8c(SB)/4, $0xffffffff
- LONG $0x0a090807
- LONG $0x0e0d0c0b
- LONG $0xffffff0f
- LONG $0xffffffff
+DATA shifts<>+0x90(SB)/4, $0x0a090807
+DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
+DATA shifts<>+0x98(SB)/4, $0xffffff0f
+DATA shifts<>+0x9c(SB)/4, $0xffffffff
- LONG $0x09080706
- LONG $0x0d0c0b0a
- LONG $0xffff0f0e
- LONG $0xffffffff
+DATA shifts<>+0xa0(SB)/4, $0x09080706
+DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
+DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
+DATA shifts<>+0xac(SB)/4, $0xffffffff
- LONG $0x08070605
- LONG $0x0c0b0a09
- LONG $0xff0f0e0d
- LONG $0xffffffff
+DATA shifts<>+0xb0(SB)/4, $0x08070605
+DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
+DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
+DATA shifts<>+0xbc(SB)/4, $0xffffffff
- LONG $0x07060504
- LONG $0x0b0a0908
- LONG $0x0f0e0d0c
- LONG $0xffffffff
+DATA shifts<>+0xc0(SB)/4, $0x07060504
+DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
+DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
+DATA shifts<>+0xcc(SB)/4, $0xffffffff
- LONG $0x06050403
- LONG $0x0a090807
- LONG $0x0e0d0c0b
- LONG $0xffffff0f
+DATA shifts<>+0xd0(SB)/4, $0x06050403
+DATA shifts<>+0xd4(SB)/4, $0x0a090807
+DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
+DATA shifts<>+0xdc(SB)/4, $0xffffff0f
- LONG $0x05040302
- LONG $0x09080706
- LONG $0x0d0c0b0a
- LONG $0xffff0f0e
+DATA shifts<>+0xe0(SB)/4, $0x05040302
+DATA shifts<>+0xe4(SB)/4, $0x09080706
+DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
+DATA shifts<>+0xec(SB)/4, $0xffff0f0e
- LONG $0x04030201
- LONG $0x08070605
- LONG $0x0c0b0a09
- LONG $0xff0f0e0d
+DATA shifts<>+0xf0(SB)/4, $0x04030201
+DATA shifts<>+0xf4(SB)/4, $0x08070605
+DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
+DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
+
+GLOBL shifts<>(SB),RODATA,$256
-TEXT runtime·memeq(SB),7,$0
+TEXT runtime·memeq(SB),NOSPLIT,$0-12
MOVL a+0(FP), SI
MOVL b+4(FP), DI
MOVL count+8(FP), BX
JMP runtime·memeqbody(SB)
-
-TEXT bytes·Equal(SB),7,$0
+TEXT bytes·Equal(SB),NOSPLIT,$0-25
MOVL a_len+4(FP), BX
MOVL b_len+16(FP), CX
XORL AX, AX
@@ -1009,7 +1098,7 @@ eqret:
// a in SI
// b in DI
// count in BX
-TEXT runtime·memeqbody(SB),7,$0
+TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
XORL AX, AX
CMPL BX, $4
@@ -1101,3 +1190,166 @@ di_finish:
equal:
SETEQ AX
RET
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
+ MOVL s1+0(FP), SI
+ MOVL s1+4(FP), BX
+ MOVL s2+8(FP), DI
+ MOVL s2+12(FP), DX
+ CALL runtime·cmpbody(SB)
+ MOVL AX, res+16(FP)
+ RET
+
+TEXT bytes·Compare(SB),NOSPLIT,$0-28
+ MOVL s1+0(FP), SI
+ MOVL s1+4(FP), BX
+ MOVL s2+12(FP), DI
+ MOVL s2+16(FP), DX
+ CALL runtime·cmpbody(SB)
+ MOVL AX, res+24(FP)
+ RET
+
+TEXT bytes·IndexByte(SB),NOSPLIT,$0
+ MOVL s+0(FP), SI
+ MOVL s_len+4(FP), CX
+ MOVB c+12(FP), AL
+ MOVL SI, DI
+ CLD; REPN; SCASB
+ JZ 3(PC)
+ MOVL $-1, ret+16(FP)
+ RET
+ SUBL SI, DI
+ SUBL $1, DI
+ MOVL DI, ret+16(FP)
+ RET
+
+TEXT strings·IndexByte(SB),NOSPLIT,$0
+ MOVL s+0(FP), SI
+ MOVL s_len+4(FP), CX
+ MOVB c+8(FP), AL
+ MOVL SI, DI
+ CLD; REPN; SCASB
+ JZ 3(PC)
+ MOVL $-1, ret+12(FP)
+ RET
+ SUBL SI, DI
+ SUBL $1, DI
+ MOVL DI, ret+12(FP)
+ RET
+
+// input:
+// SI = a
+// DI = b
+// BX = alen
+// DX = blen
+// output:
+// AX = 1/0/-1
+TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
+ CMPL SI, DI
+ JEQ cmp_allsame
+ CMPL BX, DX
+ MOVL DX, BP
+ CMOVLLT BX, BP // BP = min(alen, blen)
+ CMPL BP, $4
+ JB cmp_small
+ TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2
+ JE cmp_mediumloop
+cmp_largeloop:
+ CMPL BP, $16
+ JB cmp_mediumloop
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORL $0xffff, AX // convert EQ to NE
+ JNE cmp_diff16 // branch if at least one byte is not equal
+ ADDL $16, SI
+ ADDL $16, DI
+ SUBL $16, BP
+ JMP cmp_largeloop
+
+cmp_diff16:
+ BSFL AX, BX // index of first byte that differs
+ XORL AX, AX
+ MOVB (SI)(BX*1), CX
+ CMPB CX, (DI)(BX*1)
+ SETHI AX
+ LEAL -1(AX*2), AX // convert 1/0 to +1/-1
+ RET
+
+cmp_mediumloop:
+ CMPL BP, $4
+ JBE cmp_0through4
+ MOVL (SI), AX
+ MOVL (DI), CX
+ CMPL AX, CX
+ JNE cmp_diff4
+ ADDL $4, SI
+ ADDL $4, DI
+ SUBL $4, BP
+ JMP cmp_mediumloop
+
+cmp_0through4:
+ MOVL -4(SI)(BP*1), AX
+ MOVL -4(DI)(BP*1), CX
+ CMPL AX, CX
+ JEQ cmp_allsame
+
+cmp_diff4:
+ BSWAPL AX // reverse order of bytes
+ BSWAPL CX
+ XORL AX, CX // find bit differences
+ BSRL CX, CX // index of highest bit difference
+ SHRL CX, AX // move a's bit to bottom
+ ANDL $1, AX // mask bit
+ LEAL -1(AX*2), AX // 1/0 => +1/-1
+ RET
+
+ // 0-3 bytes in common
+cmp_small:
+ LEAL (BP*8), CX
+ NEGL CX
+ JEQ cmp_allsame
+
+ // load si
+ CMPB SI, $0xfc
+ JA cmp_si_high
+ MOVL (SI), SI
+ JMP cmp_si_finish
+cmp_si_high:
+ MOVL -4(SI)(BP*1), SI
+ SHRL CX, SI
+cmp_si_finish:
+ SHLL CX, SI
+
+ // same for di
+ CMPB DI, $0xfc
+ JA cmp_di_high
+ MOVL (DI), DI
+ JMP cmp_di_finish
+cmp_di_high:
+ MOVL -4(DI)(BP*1), DI
+ SHRL CX, DI
+cmp_di_finish:
+ SHLL CX, DI
+
+ BSWAPL SI // reverse order of bytes
+ BSWAPL DI
+ XORL SI, DI // find bit differences
+ JEQ cmp_allsame
+ BSRL DI, CX // index of highest bit difference
+ SHRL CX, SI // move a's bit to bottom
+ ANDL $1, SI // mask bit
+ LEAL -1(SI*2), AX // 1/0 => +1/-1
+ RET
+
+ // all the bytes in common are the same, so we just need
+ // to compare the lengths.
+cmp_allsame:
+ XORL AX, AX
+ XORL CX, CX
+ CMPL BX, DX
+ SETGT AX // 1 if alen > blen
+ SETEQ CX // 1 if alen == blen
+ LEAL -1(CX)(AX*2), AX // 1,0,-1 result
+ RET