// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·IndexByte(SB), NOSPLIT, $0-40
MOVQ b_base+0(FP), SI
MOVQ b_len+8(FP), BX
MOVB c+24(FP), AL
LEAQ ret+32(FP), R8
JMP indexbytebody<>(SB)
TEXT ·IndexByteString(SB), NOSPLIT, $0-32
MOVQ s_base+0(FP), SI
MOVQ s_len+8(FP), BX
MOVB c+16(FP), AL
LEAQ ret+24(FP), R8
JMP indexbytebody<>(SB)
// input:
// SI: data
// BX: data len
// AL: byte sought
// R8: address to put result
TEXT indexbytebody<>(SB), NOSPLIT, $0
// Shuffle X0 around so that each byte contains
// the character we're looking for.
MOVD AX, X0
PUNPCKLBW X0, X0
PUNPCKLBW X0, X0
PSHUFL $0, X0, X0
CMPQ BX, $16
JLT small
MOVQ SI, DI
CMPQ BX, $32
JA avx2
sse:
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
JMP sseloopentry
sseloop:
// Move the next 16-byte chunk of the data into X1.
MOVOU (DI), X1
// Compare bytes in X0 to X1.
PCMPEQB X0, X1
// Take the top bit of each byte in X1 and put the result in DX.
PMOVMSKB X1, DX
// Find first set bit, if any.
BSFL DX, DX
JNZ ssesuccess
// Advance to next block.
ADDQ $16, DI
sseloopentry:
CMPQ DI, AX
JB sseloop
// Search the last 16-byte chunk. This chunk may overlap with the
// chunks we've already searched, but that's ok.
MOVQ AX, DI
MOVOU (AX), X1
PCMPEQB X0, X1
PMOVMSKB X1, DX
BSFL DX, DX
JNZ ssesuccess
failure:
MOVQ $-1, (R8)
RET
// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
SUBQ SI, DI // Compute offset of chunk within data.
ADDQ DX, DI // Add offset of byte within chunk.
MOVQ DI, (R8)
RET
// handle for lengths < 16
small:
TESTQ BX, BX
JEQ failure
// Check if we'll load across a page boundary.
LEAQ 16(SI), AX
TESTW $0xff0, AX
JEQ endofpage
MOVOU (SI), X1 // Load data
PCMPEQB X0, X1 // Compare target byte with each byte in data.
PMOVMSKB X1, DX // Move result bits to integer register.
BSFL DX, DX // Find first set bit.
JZ failure // No set bit, failure.
CMPL DX, BX
JAE failure // Match is past end of data.
MOVQ DX, (R8)
RET
endofpage:
MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
PCMPEQB X0, X1 // Compare target byte with each byte in data.
PMOVMSKB X1, DX // Move result bits to integer register.
MOVL BX, CX
SHLL CX, DX
SHRL $16, DX // Shift desired bits down to bottom of register.
BSFL DX, DX // Find first set bit.
JZ failure // No set bit, failure.
MOVQ DX, (R8)
RET
avx2:
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JNE sse
MOVD AX, X0
LEAQ -32(SI)(BX*1), R11
VPBROADCASTB X0, Y1
avx2_loop:
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPTEST Y3, Y3
JNZ avx2success
ADDQ $32, DI
CMPQ DI, R11
JLT avx2_loop
MOVQ R11, DI
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPTEST Y3, Y3
JNZ avx2success
VZEROUPPER
MOVQ $-1, (R8)
RET
avx2success:
VPMOVMSKB Y3, DX
BSFL DX, DX
SUBQ SI, DI
ADDQ DI, DX
MOVQ DX, (R8)
VZEROUPPER
RET
|