mirror of
https://github.com/golang/go.git
synced 2025-05-05 15:43:04 +00:00
runtime: optimize the function memequal using SIMD on loong64
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
                               │      old      │                 new                  │
                               │    sec/op     │    sec/op      vs base               │
Equal/0                           0.4012n ± 0%    0.4003n ± 0%   -0.21% (p=0.000 n=10)
Equal/same/1                       2.555n ± 1%     2.419n ± 0%   -5.32% (p=0.000 n=10)
Equal/same/6                       2.574n ± 1%     2.425n ± 1%   -5.79% (p=0.000 n=10)
Equal/same/9                       2.578n ± 0%     2.419n ± 1%   -6.19% (p=0.000 n=10)
Equal/same/15                      2.565n ± 1%     2.417n ± 0%   -5.73% (p=0.000 n=10)
Equal/same/16                      2.576n ± 1%     2.414n ± 0%   -6.31% (p=0.000 n=10)
Equal/same/20                      2.573n ± 1%     2.416n ± 0%   -6.10% (p=0.000 n=10)
Equal/same/32                      2.559n ± 0%     2.411n ± 0%   -5.80% (p=0.000 n=10)
Equal/same/4K                      2.579n ± 1%     2.410n ± 0%   -6.53% (p=0.000 n=10)
Equal/same/4M                      2.571n ± 0%     2.411n ± 0%   -6.22% (p=0.000 n=10)
Equal/same/64M                     2.568n ± 1%     2.413n ± 0%   -6.05% (p=0.000 n=10)
Equal/1                            5.215n ± 0%     6.404n ± 0%  +22.80% (p=0.000 n=10)
Equal/6                           11.630n ± 0%     6.404n ± 0%  -44.94% (p=0.000 n=10)
Equal/9                           15.240n ± 0%     6.404n ± 0%  -57.98% (p=0.000 n=10)
Equal/15                          22.925n ± 0%     6.404n ± 0%  -72.07% (p=0.000 n=10)
Equal/16                          24.070n ± 0%     5.203n ± 0%  -78.38% (p=0.000 n=10)
Equal/20                          28.880n ± 0%     6.404n ± 0%  -77.83% (p=0.000 n=10)
Equal/32                          43.320n ± 0%     6.404n ± 0%  -85.22% (p=0.000 n=10)
Equal/4K                         4938.50n ± 0%     55.43n ± 0%  -98.88% (p=0.000 n=10)
Equal/4M                          5048.8µ ± 0%     202.0µ ± 0%  -96.00% (p=0.000 n=10)
Equal/64M                         80.819m ± 0%     4.539m ± 0%  -94.38% (p=0.000 n=10)
EqualBothUnaligned/64_0           79.830n ± 0%     4.803n ± 0%  -93.98% (p=0.000 n=10)
EqualBothUnaligned/64_1           79.830n ± 0%     4.803n ± 0%  -93.98% (p=0.000 n=10)
EqualBothUnaligned/64_4           79.830n ± 0%     4.803n ± 0%  -93.98% (p=0.000 n=10)
EqualBothUnaligned/64_7           79.830n ± 0%     4.803n ± 0%  -93.98% (p=0.000 n=10)
EqualBothUnaligned/4096_0        4937.00n ± 0%     65.64n ± 0%  -98.67% (p=0.000 n=10)
EqualBothUnaligned/4096_1        4937.00n ± 0%     78.85n ± 0%  -98.40% (p=0.000 n=10)
EqualBothUnaligned/4096_4        4937.00n ± 0%     78.87n ± 0%  -98.40% (p=0.000 n=10)
EqualBothUnaligned/4096_7        4937.00n ± 0%     78.87n ± 0%  -98.40% (p=0.000 n=10)
EqualBothUnaligned/4194304_0      5049.2µ ± 0%     204.2µ ± 0%  -95.96% (p=0.000 n=10)
EqualBothUnaligned/4194304_1      5049.2µ ± 0%     205.1µ ± 0%  -95.94% (p=0.000 n=10)
EqualBothUnaligned/4194304_4      5049.4µ ± 0%     205.1µ ± 0%  -95.94% (p=0.000 n=10)
EqualBothUnaligned/4194304_7      5049.2µ ± 0%     205.1µ ± 0%  -95.94% (p=0.000 n=10)
EqualBothUnaligned/67108864_0     80.796m ± 0%     3.863m ± 0%  -95.22% (p=0.000 n=10)
EqualBothUnaligned/67108864_1     80.801m ± 0%     3.706m ± 0%  -95.41% (p=0.000 n=10)
EqualBothUnaligned/67108864_4     80.799m ± 0%     3.706m ± 0%  -95.41% (p=0.000 n=10)
EqualBothUnaligned/67108864_7     80.781m ± 0%     3.706m ± 0%  -95.41% (p=0.000 n=10)
geomean                            1.040µ          149.6n       -85.63%

Change-Id: Id4c2bc0ca758337dd9759df83750c761814be488
Reviewed-on: https://go-review.googlesource.com/c/go/+/667255
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
This commit is contained in:
parent
93e4e26d5b
commit
ecdd429a3b
@@ -8,37 +8,266 @@
|
||||
#define REGCTXT R29
|
||||
|
||||
// memequal(a, b unsafe.Pointer, size uintptr) bool
// Byte-at-a-time comparison: walks a and b in lockstep until the end of a
// is reached (equal) or a pair of bytes differs (not equal).
// Registers, per the R4/R5/R6 comments further down in this file:
// R4 = a, R5 = b, R6 = size. The boolean result is left in R4.
// NOTE(review): a second TEXT for runtime·memequal follows this body; this
// page looks like a rendered diff without +/- markers, so this is presumably
// the pre-change implementation — confirm before treating it as live code.
|
||||
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
|
||||
BEQ R4, R5, eq // identical pointers: equal without reading memory
|
||||
ADDV R4, R6, R7 // R7 = a + size, the end-of-a sentinel
|
||||
PCALIGN $16
|
||||
loop:
|
||||
BNE R4, R7, test // bytes left to compare?
|
||||
MOVV $1, R4 // reached the end without a mismatch: return true
|
||||
RET
|
||||
test:
|
||||
MOVBU (R4), R9 // next byte of a
|
||||
ADDV $1, R4
|
||||
MOVBU (R5), R10 // next byte of b
|
||||
ADDV $1, R5
|
||||
BEQ R9, R10, loop // bytes match: keep going
|
||||
|
||||
|
||||
MOVB R0, R4 // mismatch: return false
|
||||
RET
|
||||
eq:
|
||||
MOVV $1, R4 // a == b as pointers: return true
|
||||
RET
|
||||
// Thin entry point: leaves the argument registers untouched and tail-jumps
// to equalbody<>, which sets R4 to 1 or 0 and returns directly to the caller.
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
|
||||
// R4 = a_base
|
||||
// R5 = b_base
|
||||
// R6 = size
|
||||
JMP equalbody<>(SB) // tail call; no frame, no return here
|
||||
|
||||
// memequal_varlen(a, b unsafe.Pointer) bool
// Same comparison as memequal, except that the size is not an argument: it
// is loaded from offset 8 of the closure addressed by REGCTXT.
// NOTE(review): two TEXT directives for runtime·memequal_varlen and two
// different bodies are interleaved below (a stack-based call to
// runtime·memequal, and a tail jump to equalbody<>). This looks like a
// rendered diff without +/- markers — confirm which lines are live before
// assembling or editing.
|
||||
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$40-17
|
||||
BEQ R4, R5, eq // identical pointers: skip the comparison
|
||||
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0
|
||||
// R4 = a_base
|
||||
// R5 = b_base
|
||||
MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure
|
||||
MOVV R4, 8(R3) // store a at 8(R3)
|
||||
MOVV R5, 16(R3) // store b at 16(R3)
|
||||
MOVV R6, 24(R3) // store size at 24(R3)
|
||||
JAL runtime·memequal(SB)
|
||||
MOVBU 32(R3), R4 // load the byte at 32(R3) into R4 as the result
|
||||
RET
|
||||
eq:
|
||||
JMP equalbody<>(SB) // tail call with R4 = a, R5 = b, R6 = size
|
||||
|
||||
// equalbody<> compares the size bytes at a_base with the size bytes at b_base.
// It is the jump target used by runtime·memequal and runtime·memequal_varlen
// in this file.
//
// Strategy:
//   size >= 64: use the LASX (256-bit) loops if the HasLASX flag is non-zero,
//               else the LSX (128-bit) loops if the HasLSX flag is non-zero,
//               else a 64-byte loop built from 8-byte integer loads.
//   size <  64: (and every remainder left by the loops above) is handled by
//               `tail`, in steps of 16, 8, 4, 2 and 1 bytes.
//
// output:
//   R4 = 1 if the two regions are equal, 0 otherwise
// Registers written: R4-R21, R23, X0-X15 / V0-V15, FCC0.
//
// input:
|
||||
// R4 = a_base
|
||||
// R5 = b_base
|
||||
// R6 = size
|
||||
TEXT equalbody<>(SB),NOSPLIT|NOFRAME,$0
|
||||
// a_base == b_base
|
||||
BEQ R4, R5, equal
|
||||
// 0 bytes
|
||||
BEQ R6, equal
|
||||
|
||||
MOVV $64, R7
|
||||
BGE R6, R7, lasx // 64 bytes or more: try the wide paths
|
||||
|
||||
// size < 64 bytes
|
||||
// Every path into tail has already branched to `equal` when R6 hit 0,
|
||||
// so R6 is non-zero here. R4/R5 point at the next unread bytes and R6 is
|
||||
// the number of bytes still to compare.
|
||||
tail:
|
||||
MOVV $16, R7
|
||||
BLT R6, R7, lt_16
|
||||
generic16_loop:
|
||||
ADDV $-16, R6 // account for the 16 bytes compared below
|
||||
MOVV 0(R4), R8
|
||||
MOVV 8(R4), R9
|
||||
MOVV 0(R5), R10
|
||||
MOVV 8(R5), R11
|
||||
BNE R8, R10, not_equal
|
||||
BNE R9, R11, not_equal
|
||||
BEQ R6, equal // nothing left: done before advancing the pointers
|
||||
ADDV $16, R4
|
||||
ADDV $16, R5
|
||||
BGE R6, R7, generic16_loop // R7 is still 16
|
||||
|
||||
// size < 16 bytes
|
||||
lt_16:
|
||||
MOVV $8, R7
|
||||
BLT R6, R7, lt_8
|
||||
ADDV $-8, R6
|
||||
MOVV 0(R4), R8 // one 8-byte chunk from each side
|
||||
MOVV 0(R5), R9
|
||||
BNE R8, R9, not_equal
|
||||
BEQ R6, equal
|
||||
ADDV $8, R4
|
||||
ADDV $8, R5
|
||||
|
||||
// size < 8 bytes
|
||||
lt_8:
|
||||
MOVV $4, R7
|
||||
BLT R6, R7, lt_4
|
||||
ADDV $-4, R6
|
||||
MOVW 0(R4), R8 // one 4-byte chunk from each side, loaded the same way
|
||||
MOVW 0(R5), R9
|
||||
BNE R8, R9, not_equal
|
||||
BEQ R6, equal
|
||||
ADDV $4, R4
|
||||
ADDV $4, R5
|
||||
|
||||
// size < 4 bytes
|
||||
lt_4:
|
||||
MOVV $2, R7
|
||||
BLT R6, R7, lt_2
|
||||
ADDV $-2, R6
|
||||
MOVH 0(R4), R8 // one 2-byte chunk from each side
|
||||
MOVH 0(R5), R9
|
||||
BNE R8, R9, not_equal
|
||||
BEQ R6, equal
|
||||
ADDV $2, R4
|
||||
ADDV $2, R5
|
||||
|
||||
// size < 2 bytes
|
||||
// Each earlier step leaves through `equal` when R6 reaches 0, so getting
|
||||
// here means exactly one byte remains; R6 is not consulted again.
|
||||
lt_2:
|
||||
MOVB 0(R4), R8
|
||||
MOVB 0(R5), R9
|
||||
BNE R8, R9, not_equal
|
||||
|
||||
// The last byte matched: fall through into equal.
|
||||
equal:
|
||||
MOVV $1, R4 // result = true
|
||||
RET
|
||||
|
||||
not_equal:
|
||||
MOVV R0, R4 // result = false
|
||||
RET
|
||||
|
||||
// Implemented using 256-bit SIMD instructions
|
||||
lasx:
|
||||
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7 // runtime CPU feature flag
|
||||
BEQ R7, lsx // flag is zero: fall back to the 128-bit path
|
||||
|
||||
lasx256:
|
||||
MOVV $256, R7
|
||||
BLT R6, R7, lasx64
|
||||
// 256 bytes per iteration: eight 32-byte vectors from a (X0-X7) against
|
||||
// eight from b (X8-X15).
|
||||
lasx256_loop:
|
||||
ADDV $-256, R6
|
||||
XVMOVQ 0(R4), X0
|
||||
XVMOVQ 32(R4), X1
|
||||
XVMOVQ 64(R4), X2
|
||||
XVMOVQ 96(R4), X3
|
||||
XVMOVQ 128(R4), X4
|
||||
XVMOVQ 160(R4), X5
|
||||
XVMOVQ 192(R4), X6
|
||||
XVMOVQ 224(R4), X7
|
||||
XVMOVQ 0(R5), X8
|
||||
XVMOVQ 32(R5), X9
|
||||
XVMOVQ 64(R5), X10
|
||||
XVMOVQ 96(R5), X11
|
||||
XVMOVQ 128(R5), X12
|
||||
XVMOVQ 160(R5), X13
|
||||
XVMOVQ 192(R5), X14
|
||||
XVMOVQ 224(R5), X15
|
||||
// NOTE(review): XVSEQV presumably yields a per-64-bit-lane equality mask
|
||||
// (all ones where the lanes match) — confirm against the LASX manual.
|
||||
XVSEQV X0, X8, X0
|
||||
XVSEQV X1, X9, X1
|
||||
XVSEQV X2, X10, X2
|
||||
XVSEQV X3, X11, X3
|
||||
XVSEQV X4, X12, X4
|
||||
XVSEQV X5, X13, X5
|
||||
XVSEQV X6, X14, X6
|
||||
XVSEQV X7, X15, X7
|
||||
// AND the eight results pairwise (8 -> 4 -> 2 -> 1) into X0.
|
||||
XVANDV X0, X1, X0
|
||||
XVANDV X2, X3, X2
|
||||
XVANDV X4, X5, X4
|
||||
XVANDV X6, X7, X6
|
||||
XVANDV X0, X2, X0
|
||||
XVANDV X4, X6, X4
|
||||
XVANDV X0, X4, X0
|
||||
// NOTE(review): presumably FCC0 is set only when every lane of X0 is
|
||||
// non-zero, and BFPF branches when FCC0 is false — confirm.
|
||||
XVSETALLNEV X0, FCC0
|
||||
BFPF not_equal
|
||||
BEQ R6, equal // nothing left: done before advancing the pointers
|
||||
ADDV $256, R4
|
||||
ADDV $256, R5
|
||||
BGE R6, R7, lasx256_loop // R7 is still 256
|
||||
|
||||
lasx64:
|
||||
MOVV $64, R7
|
||||
BLT R6, R7, tail
|
||||
// 64 bytes per iteration: two 32-byte vectors from each side.
|
||||
lasx64_loop:
|
||||
ADDV $-64, R6
|
||||
XVMOVQ 0(R4), X0
|
||||
XVMOVQ 32(R4), X1
|
||||
XVMOVQ 0(R5), X2
|
||||
XVMOVQ 32(R5), X3
|
||||
XVSEQV X0, X2, X0
|
||||
XVSEQV X1, X3, X1
|
||||
XVANDV X0, X1, X0
|
||||
XVSETALLNEV X0, FCC0
|
||||
BFPF not_equal
|
||||
BEQ R6, equal
|
||||
ADDV $64, R4
|
||||
ADDV $64, R5
|
||||
BGE R6, R7, lasx64_loop
|
||||
JMP tail // fewer than 64 bytes (and more than 0) remain
|
||||
|
||||
// Implemented using 128-bit SIMD instructions
|
||||
lsx:
|
||||
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7 // runtime CPU feature flag
|
||||
BEQ R7, generic64_loop // flag is zero: use integer loads (R6 >= 64 on this path)
|
||||
|
||||
lsx128:
|
||||
MOVV $128, R7
|
||||
BLT R6, R7, lsx32
|
||||
// 128 bytes per iteration: eight 16-byte vectors from a (V0-V7) against
|
||||
// eight from b (V8-V15); same compare/AND-reduce shape as lasx256_loop.
|
||||
lsx128_loop:
|
||||
ADDV $-128, R6
|
||||
VMOVQ 0(R4), V0
|
||||
VMOVQ 16(R4), V1
|
||||
VMOVQ 32(R4), V2
|
||||
VMOVQ 48(R4), V3
|
||||
VMOVQ 64(R4), V4
|
||||
VMOVQ 80(R4), V5
|
||||
VMOVQ 96(R4), V6
|
||||
VMOVQ 112(R4), V7
|
||||
VMOVQ 0(R5), V8
|
||||
VMOVQ 16(R5), V9
|
||||
VMOVQ 32(R5), V10
|
||||
VMOVQ 48(R5), V11
|
||||
VMOVQ 64(R5), V12
|
||||
VMOVQ 80(R5), V13
|
||||
VMOVQ 96(R5), V14
|
||||
VMOVQ 112(R5), V15
|
||||
VSEQV V0, V8, V0
|
||||
VSEQV V1, V9, V1
|
||||
VSEQV V2, V10, V2
|
||||
VSEQV V3, V11, V3
|
||||
VSEQV V4, V12, V4
|
||||
VSEQV V5, V13, V5
|
||||
VSEQV V6, V14, V6
|
||||
VSEQV V7, V15, V7
|
||||
VANDV V0, V1, V0
|
||||
VANDV V2, V3, V2
|
||||
VANDV V4, V5, V4
|
||||
VANDV V6, V7, V6
|
||||
VANDV V0, V2, V0
|
||||
VANDV V4, V6, V4
|
||||
VANDV V0, V4, V0
|
||||
VSETALLNEV V0, FCC0
|
||||
BFPF not_equal
|
||||
BEQ R6, equal
|
||||
|
||||
ADDV $128, R4
|
||||
ADDV $128, R5
|
||||
BGE R6, R7, lsx128_loop // R7 is still 128
|
||||
|
||||
lsx32:
|
||||
MOVV $32, R7
|
||||
BLT R6, R7, tail
|
||||
// 32 bytes per iteration: two 16-byte vectors from each side.
|
||||
lsx32_loop:
|
||||
ADDV $-32, R6
|
||||
VMOVQ 0(R4), V0
|
||||
VMOVQ 16(R4), V1
|
||||
VMOVQ 0(R5), V2
|
||||
VMOVQ 16(R5), V3
|
||||
VSEQV V0, V2, V0
|
||||
VSEQV V1, V3, V1
|
||||
VANDV V0, V1, V0
|
||||
VSETALLNEV V0, FCC0
|
||||
BFPF not_equal
|
||||
BEQ R6, equal
|
||||
ADDV $32, R4
|
||||
ADDV $32, R5
|
||||
BGE R6, R7, lsx32_loop
|
||||
JMP tail // fewer than 32 bytes (and more than 0) remain
|
||||
|
||||
// Implemented using general instructions
|
||||
// 64 bytes per iteration, compared as two 32-byte halves so a mismatch in
|
||||
// the first half exits before the second half is loaded.
|
||||
generic64_loop:
|
||||
ADDV $-64, R6
|
||||
MOVV 0(R4), R7 // R7 doubles as a data register here; it is reloaded with 64 below
|
||||
MOVV 8(R4), R8
|
||||
MOVV 16(R4), R9
|
||||
MOVV 24(R4), R10
|
||||
MOVV 0(R5), R15
|
||||
MOVV 8(R5), R16
|
||||
MOVV 16(R5), R17
|
||||
MOVV 24(R5), R18
|
||||
BNE R7, R15, not_equal
|
||||
BNE R8, R16, not_equal
|
||||
BNE R9, R17, not_equal
|
||||
BNE R10, R18, not_equal
|
||||
MOVV 32(R4), R11
|
||||
MOVV 40(R4), R12
|
||||
MOVV 48(R4), R13
|
||||
MOVV 56(R4), R14
|
||||
MOVV 32(R5), R19
|
||||
MOVV 40(R5), R20
|
||||
MOVV 48(R5), R21
|
||||
MOVV 56(R5), R23 // R22 is skipped in this sequence
|
||||
BNE R11, R19, not_equal
|
||||
BNE R12, R20, not_equal
|
||||
BNE R13, R21, not_equal
|
||||
BNE R14, R23, not_equal
|
||||
BEQ R6, equal
|
||||
ADDV $64, R4
|
||||
ADDV $64, R5
|
||||
MOVV $64, R7 // restore the loop threshold clobbered by the load above
|
||||
BGE R6, R7, generic64_loop
|
||||
JMP tail // fewer than 64 bytes (and more than 0) remain
|
||||
|
Loading…
x
Reference in New Issue
Block a user