//go:build amd64 && go1.22
// +build amd64,go1.22

#include "go_asm.h"
#include "textflag.h"

// TODO: investigate whether POPCNT could be used to speed up the non-ASCII scan.

// IndexByteNonASCII: entry point taking a three-word slice header named b
// (presumably declared in Go as func IndexByteNonASCII(b []byte) int --
// confirm against the Go stub).
//
// It only marshals the stack arguments into the registers used by
// indexByteBodyNonASCII and jumps there; the shared body stores the result
// through R8 and returns directly to our caller.
TEXT ·IndexByteNonASCII(SB), NOSPLIT, $0-32
	LEAQ ret+24(FP), R8   // R8 = address of the result slot
	MOVQ b_len+8(FP), BX  // BX = number of bytes to scan
	MOVQ b_base+0(FP), SI // SI = pointer to the first byte
	JMP  indexByteBodyNonASCII<>(SB)

// IndexNonASCII: entry point taking a two-word string header named s
// (presumably declared in Go as func IndexNonASCII(s string) int --
// confirm against the Go stub).
//
// It only marshals the stack arguments into the registers used by
// indexByteBodyNonASCII and jumps there; the shared body stores the result
// through R8 and returns directly to our caller.
TEXT ·IndexNonASCII(SB), NOSPLIT, $0-24
	LEAQ ret+16(FP), R8   // R8 = address of the result slot
	MOVQ s_len+8(FP), BX  // BX = number of bytes to scan
	MOVQ s_base+0(FP), SI // SI = pointer to the first byte
	JMP  indexByteBodyNonASCII<>(SB)

// indexByteBodyNonASCII locates the first byte whose top bit (0x80) is set.
//
// input:
//   SI: data
//   BX: data length in bytes
//   R8: address to put result
// output:
//   (R8): index of the first byte >= 0x80, or -1 if there is none
// clobbers:
//   AX, CX, DX, DI, R11, X1, Y2, flags
TEXT indexByteBodyNonASCII<>(SB), NOSPLIT, $0
	// No comparison mask is required: a byte is non-ASCII exactly when its
	// top bit is set, and (V)PMOVMSKB extracts the top bit of every byte
	// straight from the loaded data.
	CMPQ BX, $16
	JLT  small

	MOVQ SI, DI // DI = address of the chunk currently being examined

	CMPQ BX, $32
	JA   avx2

sse:
	LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
	JMP  sseloopentry

	PCALIGN $16

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU (DI), X1

	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX

	// Find first set bit, if any (ZF is set when DX was zero).
	BSFL DX, DX
	JNZ  ssesuccess

	// Advance to next block.
	ADDQ $16, DI

sseloopentry:
	CMPQ DI, AX
	JB   sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ     AX, DI
	MOVOU    (AX), X1
	PMOVMSKB X1, DX
	BSFL     DX, DX
	JNZ      ssesuccess

failure:
	MOVQ $-1, (R8)
	RET

// We've found a chunk containing a non-ASCII byte.
// The chunk was loaded from DI.
// The index of the first non-ASCII byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ SI, DI   // Compute offset of chunk within data.
	ADDQ DX, DI   // Add offset of byte within chunk.
	MOVQ DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ BX, BX
	JEQ   failure

	// Check if a 16-byte load from SI could cross a page boundary.
	LEAQ  16(SI), AX
	TESTW $0xff0, AX
	JEQ   endofpage

	MOVOU    (SI), X1 // Load 16 bytes; only the first BX are data.
	PMOVMSKB X1, DX   // Move top bits to integer register.
	BSFL     DX, DX   // Find first set bit.
	JZ       failure  // No set bit, failure.
	CMPL     DX, BX
	JAE      failure  // Hit is past end of data.
	MOVQ     DX, (R8)
	RET

endofpage:
	MOVOU    -16(SI)(BX*1), X1 // Load data into the high end of X1.
	PMOVMSKB X1, DX            // Move top bits to integer register.
	MOVL     BX, CX
	SHLL     CX, DX            // Push the bits of the BX data bytes up to bit 16.
	SHRL     $16, DX           // Shift desired bits down to bottom of register.
	BSFL     DX, DX            // Find first set bit.
	JZ       failure           // No set bit, failure.
	MOVQ     DX, (R8)
	RET

avx2:
#ifndef hasAVX2
	CMPB golang·org∕x∕sys∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE  sse

#endif
	LEAQ -32(SI)(BX*1), R11 // R11 = address of last 32 bytes

	PCALIGN $32

avx2_loop:
	VMOVDQU   (DI), Y2
	VPMOVMSKB Y2, DX      // DX = top bit of each of the 32 bytes
	TESTL     DX, DX
	JNZ       avx2success
	ADDQ      $32, DI
	CMPQ      DI, R11
	JB        avx2_loop   // Unsigned: DI and R11 are addresses.

	// Search the last 32-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ      R11, DI
	VMOVDQU   (DI), Y2
	VPMOVMSKB Y2, DX
	TESTL     DX, DX
	JNZ       avx2success
	VZEROUPPER
	MOVQ      $-1, (R8)
	RET

// We've found a 32-byte chunk containing a non-ASCII byte.
// The chunk was loaded from DI and DX holds its (non-zero) top-bit mask.
avx2success:
	BSFL DX, DX   // Index of first non-ASCII byte within the chunk.
	SUBQ SI, DI   // Compute offset of chunk within data.
	ADDQ DI, DX   // Add offset of byte within chunk.
	MOVQ DX, (R8)
	VZEROUPPER
	RET
