diff --git a/src/internal/bytealg/equal_loong64.s b/src/internal/bytealg/equal_loong64.s
index 830b09bd2c..8f570e8eae 100644
--- a/src/internal/bytealg/equal_loong64.s
+++ b/src/internal/bytealg/equal_loong64.s
@@ -8,37 +8,266 @@
 #define	REGCTXT	R29
 
 // memequal(a, b unsafe.Pointer, size uintptr) bool
-TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
-	BEQ	R4, R5, eq
-	ADDV	R4, R6, R7
-	PCALIGN	$16
-loop:
-	BNE	R4, R7, test
-	MOVV	$1, R4
-	RET
-test:
-	MOVBU	(R4), R9
-	ADDV	$1, R4
-	MOVBU	(R5), R10
-	ADDV	$1, R5
-	BEQ	R9, R10, loop
-
-	MOVB	R0, R4
-	RET
-eq:
-	MOVV	$1, R4
-	RET
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0
+	// R4 = a_base
+	// R5 = b_base
+	// R6 = size
+	JMP	equalbody<>(SB)
 
 // memequal_varlen(a, b unsafe.Pointer) bool
-TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
-	BEQ	R4, R5, eq
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$0
+	// R4 = a_base
+	// R5 = b_base
 	MOVV	8(REGCTXT), R6	// compiler stores size at offset 8 in the closure
-	MOVV	R4, 8(R3)
-	MOVV	R5, 16(R3)
-	MOVV	R6, 24(R3)
-	JAL	runtime·memequal(SB)
-	MOVBU	32(R3), R4
-	RET
-eq:
+	JMP	equalbody<>(SB)
+
+// input:
+// R4 = a_base
+// R5 = b_base
+// R6 = size
+TEXT equalbody<>(SB),NOSPLIT|NOFRAME,$0
+	// a_base == b_base
+	BEQ	R4, R5, equal
+	// 0 bytes
+	BEQ	R6, equal
+
+	MOVV	$64, R7
+	BGE	R6, R7, lasx
+
+	// size < 64 bytes
+tail:
+	MOVV	$16, R7
+	BLT	R6, R7, lt_16
+generic16_loop:
+	ADDV	$-16, R6
+	MOVV	0(R4), R8
+	MOVV	8(R4), R9
+	MOVV	0(R5), R10
+	MOVV	8(R5), R11
+	BNE	R8, R10, not_equal
+	BNE	R9, R11, not_equal
+	BEQ	R6, equal
+	ADDV	$16, R4
+	ADDV	$16, R5
+	BGE	R6, R7, generic16_loop
+
+	// size < 16 bytes
+lt_16:
+	MOVV	$8, R7
+	BLT	R6, R7, lt_8
+	ADDV	$-8, R6
+	MOVV	0(R4), R8
+	MOVV	0(R5), R9
+	BNE	R8, R9, not_equal
+	BEQ	R6, equal
+	ADDV	$8, R4
+	ADDV	$8, R5
+
+	// size < 8 bytes
+lt_8:
+	MOVV	$4, R7
+	BLT	R6, R7, lt_4
+	ADDV	$-4, R6
+	MOVW	0(R4), R8
+	MOVW	0(R5), R9
+	BNE	R8, R9, not_equal
+	BEQ	R6, equal
+	ADDV	$4, R4
+	ADDV	$4, R5
+
+	// size < 4 bytes
+lt_4:
+	MOVV	$2, R7
+	BLT	R6, R7, lt_2
+	ADDV	$-2, R6
+	MOVH	0(R4), R8
+	MOVH	0(R5), R9
+	BNE	R8, R9, not_equal
+	BEQ	R6, equal
+	ADDV	$2, R4
+	ADDV	$2, R5
+
+	// size < 2 bytes
+lt_2:
+	MOVB	0(R4), R8
+	MOVB	0(R5), R9
+	BNE	R8, R9, not_equal
+
+equal:
 	MOVV	$1, R4
 	RET
+
+not_equal:
+	MOVV	R0, R4
+	RET
+
+	// Implemented using 256-bit SIMD instructions
+lasx:
+	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7
+	BEQ	R7, lsx
+
+lasx256:
+	MOVV	$256, R7
+	BLT	R6, R7, lasx64
+lasx256_loop:
+	ADDV	$-256, R6
+	XVMOVQ	0(R4), X0
+	XVMOVQ	32(R4), X1
+	XVMOVQ	64(R4), X2
+	XVMOVQ	96(R4), X3
+	XVMOVQ	128(R4), X4
+	XVMOVQ	160(R4), X5
+	XVMOVQ	192(R4), X6
+	XVMOVQ	224(R4), X7
+	XVMOVQ	0(R5), X8
+	XVMOVQ	32(R5), X9
+	XVMOVQ	64(R5), X10
+	XVMOVQ	96(R5), X11
+	XVMOVQ	128(R5), X12
+	XVMOVQ	160(R5), X13
+	XVMOVQ	192(R5), X14
+	XVMOVQ	224(R5), X15
+	XVSEQV	X0, X8, X0
+	XVSEQV	X1, X9, X1
+	XVSEQV	X2, X10, X2
+	XVSEQV	X3, X11, X3
+	XVSEQV	X4, X12, X4
+	XVSEQV	X5, X13, X5
+	XVSEQV	X6, X14, X6
+	XVSEQV	X7, X15, X7
+	XVANDV	X0, X1, X0
+	XVANDV	X2, X3, X2
+	XVANDV	X4, X5, X4
+	XVANDV	X6, X7, X6
+	XVANDV	X0, X2, X0
+	XVANDV	X4, X6, X4
+	XVANDV	X0, X4, X0
+	XVSETALLNEV	X0, FCC0
+	BFPF	not_equal
+	BEQ	R6, equal
+	ADDV	$256, R4
+	ADDV	$256, R5
+	BGE	R6, R7, lasx256_loop
+
+lasx64:
+	MOVV	$64, R7
+	BLT	R6, R7, tail
+lasx64_loop:
+	ADDV	$-64, R6
+	XVMOVQ	0(R4), X0
+	XVMOVQ	32(R4), X1
+	XVMOVQ	0(R5), X2
+	XVMOVQ	32(R5), X3
+	XVSEQV	X0, X2, X0
+	XVSEQV	X1, X3, X1
+	XVANDV	X0, X1, X0
+	XVSETALLNEV	X0, FCC0
+	BFPF	not_equal
+	BEQ	R6, equal
+	ADDV	$64, R4
+	ADDV	$64, R5
+	BGE	R6, R7, lasx64_loop
+	JMP	tail
+
+	// Implemented using 128-bit SIMD instructions
+lsx:
+	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
+	BEQ	R7, generic64_loop
+
+lsx128:
+	MOVV	$128, R7
+	BLT	R6, R7, lsx32
+lsx128_loop:
+	ADDV	$-128, R6
+	VMOVQ	0(R4), V0
+	VMOVQ	16(R4), V1
+	VMOVQ	32(R4), V2
+	VMOVQ	48(R4), V3
+	VMOVQ	64(R4), V4
+	VMOVQ	80(R4), V5
+	VMOVQ	96(R4), V6
+	VMOVQ	112(R4), V7
+	VMOVQ	0(R5), V8
+	VMOVQ	16(R5), V9
+	VMOVQ	32(R5), V10
+	VMOVQ	48(R5), V11
+	VMOVQ	64(R5), V12
+	VMOVQ	80(R5), V13
+	VMOVQ	96(R5), V14
+	VMOVQ	112(R5), V15
+	VSEQV	V0, V8, V0
+	VSEQV	V1, V9, V1
+	VSEQV	V2, V10, V2
+	VSEQV	V3, V11, V3
+	VSEQV	V4, V12, V4
+	VSEQV	V5, V13, V5
+	VSEQV	V6, V14, V6
+	VSEQV	V7, V15, V7
+	VANDV	V0, V1, V0
+	VANDV	V2, V3, V2
+	VANDV	V4, V5, V4
+	VANDV	V6, V7, V6
+	VANDV	V0, V2, V0
+	VANDV	V4, V6, V4
+	VANDV	V0, V4, V0
+	VSETALLNEV	V0, FCC0
+	BFPF	not_equal
+	BEQ	R6, equal
+
+	ADDV	$128, R4
+	ADDV	$128, R5
+	BGE	R6, R7, lsx128_loop
+
+lsx32:
+	MOVV	$32, R7
+	BLT	R6, R7, tail
+lsx32_loop:
+	ADDV	$-32, R6
+	VMOVQ	0(R4), V0
+	VMOVQ	16(R4), V1
+	VMOVQ	0(R5), V2
+	VMOVQ	16(R5), V3
+	VSEQV	V0, V2, V0
+	VSEQV	V1, V3, V1
+	VANDV	V0, V1, V0
+	VSETALLNEV	V0, FCC0
+	BFPF	not_equal
+	BEQ	R6, equal
+	ADDV	$32, R4
+	ADDV	$32, R5
+	BGE	R6, R7, lsx32_loop
+	JMP	tail
+
+	// Implemented using general instructions
+generic64_loop:
+	ADDV	$-64, R6
+	MOVV	0(R4), R7
+	MOVV	8(R4), R8
+	MOVV	16(R4), R9
+	MOVV	24(R4), R10
+	MOVV	0(R5), R15
+	MOVV	8(R5), R16
+	MOVV	16(R5), R17
+	MOVV	24(R5), R18
+	BNE	R7, R15, not_equal
+	BNE	R8, R16, not_equal
+	BNE	R9, R17, not_equal
+	BNE	R10, R18, not_equal
+	MOVV	32(R4), R11
+	MOVV	40(R4), R12
+	MOVV	48(R4), R13
+	MOVV	56(R4), R14
+	MOVV	32(R5), R19
+	MOVV	40(R5), R20
+	MOVV	48(R5), R21
+	MOVV	56(R5), R23
+	BNE	R11, R19, not_equal
+	BNE	R12, R20, not_equal
+	BNE	R13, R21, not_equal
+	BNE	R14, R23, not_equal
+	BEQ	R6, equal
+	ADDV	$64, R4
+	ADDV	$64, R5
+	MOVV	$64, R7
+	BGE	R6, R7, generic64_loop
+	JMP	tail
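
The patch is easier to follow once the control flow is seen at a higher level: equalbody<> first handles the trivial cases (same pointer, zero size), dispatches sizes of 64 bytes or more to LASX (256-bit), LSX (128-bit) or a 64-byte scalar loop depending on CPU features, and finishes every path through a common tail that compares 16-, 8-, 4-, 2- and 1-byte chunks. A minimal Go-level sketch of that strategy follows. It is illustrative only and not part of the patch: the name memequalSketch is made up, and it models just the scalar path (the SIMD branches perform the same comparison over wider blocks).

// memequalSketch mirrors the scalar control flow of equalbody<> above.
// Hypothetical reference code, not the actual runtime implementation.
package main

import (
	"encoding/binary"
	"fmt"
)

func memequalSketch(a, b []byte) bool {
	if len(a) != len(b) {
		return false
	}
	// generic16_loop: compare 16 bytes per iteration while at least 16 remain.
	for len(a) >= 16 {
		if binary.LittleEndian.Uint64(a) != binary.LittleEndian.Uint64(b) ||
			binary.LittleEndian.Uint64(a[8:]) != binary.LittleEndian.Uint64(b[8:]) {
			return false
		}
		a, b = a[16:], b[16:]
	}
	// lt_16 / lt_8 / lt_4 / lt_2: at most one comparison each of width
	// 8, 4, 2 and 1, which together cover any remaining 0..15 bytes.
	for _, w := range []int{8, 4, 2, 1} {
		if len(a) >= w {
			for i := 0; i < w; i++ {
				if a[i] != b[i] {
					return false
				}
			}
			a, b = a[w:], b[w:]
		}
	}
	return true
}

func main() {
	fmt.Println(memequalSketch([]byte("hello, loong64"), []byte("hello, loong64"))) // true
	fmt.Println(memequalSketch([]byte("hello, loong64"), []byte("hello, LOONG64"))) // false
}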