summaryrefslogtreecommitdiff
path: root/src/pkg/bytes/asm_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/pkg/bytes/asm_amd64.s')
-rw-r--r--src/pkg/bytes/asm_amd64.s93
1 files changed, 84 insertions, 9 deletions
diff --git a/src/pkg/bytes/asm_amd64.s b/src/pkg/bytes/asm_amd64.s
index 7e78700ec..c6793cbdc 100644
--- a/src/pkg/bytes/asm_amd64.s
+++ b/src/pkg/bytes/asm_amd64.s
@@ -3,15 +3,90 @@
// license that can be found in the LICENSE file.
TEXT ·IndexByte(SB),7,$0
- MOVQ p+0(FP), SI
- MOVL len+8(FP), CX
- MOVB b+16(FP), AL
- MOVQ SI, DI
+ MOVQ p+0(FP), SI // SI = start of the buffer; kept to compute the result index
+ MOVL len+8(FP), BX // BX = number of bytes to search
+ MOVB b+16(FP), AL // AL = byte to find (SCASB compares against AL)
+ MOVQ SI, DI // DI = scan position used by SCASB
+
+ CMPL BX, $16
+ JLT small // fewer than 16 bytes: plain scan only
+
+ // round up to first 16-byte boundary
+ TESTQ $15, SI
+ JZ aligned // start is already 16-byte aligned
+ MOVQ SI, CX
+ ANDQ $~15, CX
+ ADDQ $16, CX // CX = first 16-byte boundary above SI
+
+ // search the beginning
+ SUBQ SI, CX // CX = number of bytes before that boundary
+ REPN; SCASB
+ JZ success
+
+// DI is 16-byte aligned; get ready to search using SSE instructions
+aligned:
+ // round down to last 16-byte boundary
+ MOVQ BX, R11
+ ADDQ SI, R11
+ ANDQ $~15, R11 // R11 = (SI + BX) rounded down to a multiple of 16
+
+ // shuffle X0 around so that each byte contains b (the byte to find)
+ MOVD AX, X0 // only the low byte (AL) of AX is meaningful here
+ PUNPCKLBW X0, X0 // low 2 bytes of X0 = b
+ PUNPCKLBW X0, X0 // low 4 bytes of X0 = b
+ PSHUFL $0, X0, X0 // copy the low 4 bytes into all four 32-bit lanes
+ JMP condition // check the loop bound before reading the first chunk
+
+sse:
+ // move the next 16-byte chunk of the buffer into X1
+ MOVO (DI), X1
+ // compare bytes in X0 to X1
+ PCMPEQB X0, X1
+ // take the top bit of each byte in X1 and put the result in DX
+ PMOVMSKB X1, DX
+ TESTL DX, DX
+ JNZ ssesuccess // a set bit means some byte in this chunk matched
+ ADDQ $16, DI // no match; advance to the next chunk
+
+condition:
+ CMPQ DI, R11
+ JLT sse // keep going while whole chunks remain before R11
+
+ // search the end
+ MOVQ SI, CX
+ ADDQ BX, CX
+ SUBQ R11, CX // CX = bytes left after the last whole chunk
+ // if CX == 0, SUBQ has set the zero flag and REPN; SCASB would compare
+ // nothing, so the JZ after it would report a false success
+ JZ failure
REPN; SCASB
- JZ 3(PC)
- MOVL $-1, ret+24(FP)
+ JZ success
+
+failure:
+ MOVL $-1, ret+24(FP) // byte not found
+ RET
+
+// handle lengths < 16 with a plain REPN; SCASB scan
+small:
+ MOVL BX, CX // CX = whole length
+ REPN; SCASB
+ JZ success
+ MOVL $-1, ret+24(FP) // byte not found
RET
- SUBQ SI, DI
- SUBL $1, DI
- MOVL DI, ret+24(FP)
+
+// we've found the chunk containing the byte
+// now just figure out which specific byte it is
+ssesuccess:
+ // get the index of the least significant set bit (first match in the chunk)
+ BSFW DX, DX
+ SUBQ SI, DI // DI = offset of this chunk from the start of the buffer
+ ADDQ DI, DX // DX = chunk offset + offset within the chunk
+ MOVL DX, ret+24(FP)
+ RET
+
+success:
+ SUBQ SI, DI // DI = offset just past the byte SCASB matched
+ SUBL $1, DI // step back to the matching byte itself
+ MOVL DI, ret+24(FP)
RET
+