src/internal/bytealg: optimize the function Compare on loong64

The relevant performance improved by 66.73%.

benchmark:
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000 @ 2500.00MHz
                                      │     old      │                 new                 │
                                      │    sec/op    │   sec/op     vs base                │
BytesCompare/1                           5.603n ± 0%   4.002n ± 0%  -28.57% (p=0.000 n=20)
BytesCompare/2                           6.405n ± 0%   4.002n ± 0%  -37.52% (p=0.000 n=20)
BytesCompare/4                           8.007n ± 0%   4.002n ± 0%  -50.02% (p=0.000 n=20)
BytesCompare/8                          11.210n ± 0%   4.002n ± 0%  -64.30% (p=0.000 n=20)
BytesCompare/16                          6.005n ± 0%   4.802n ± 0%  -20.03% (p=0.000 n=20)
BytesCompare/32                          6.806n ± 0%   4.402n ± 0%  -35.32% (p=0.000 n=20)
BytesCompare/64                          8.407n ± 0%   6.003n ± 0%  -28.60% (p=0.000 n=20)
BytesCompare/128                        11.610n ± 0%   8.404n ± 0%  -27.61% (p=0.000 n=20)
BytesCompare/256                         18.02n ± 0%   14.01n ± 0%  -22.25% (p=0.000 n=20)
BytesCompare/512                         31.23n ± 0%   26.98n ± 0%  -13.61% (p=0.000 n=20)
BytesCompare/1024                        56.85n ± 0%   52.43n ± 0%   -7.77% (p=0.000 n=20)
BytesCompare/2048                        108.1n ± 0%   103.8n ± 0%   -3.98% (p=0.000 n=20)
CompareBytesEqual                       15.610n ± 0%   5.203n ± 0%  -66.67% (p=0.000 n=20)
CompareBytesToNil                        3.203n ± 0%   3.202n ± 0%   -0.03% (p=0.000 n=20)
CompareBytesEmpty                        3.203n ± 0%   2.423n ± 0%  -24.35% (p=0.000 n=20)
CompareBytesIdentical                    3.203n ± 0%   2.424n ± 0%  -24.32% (p=0.000 n=20)
CompareBytesSameLength                   8.407n ± 0%   8.004n ± 0%   -4.79% (p=0.000 n=20)
CompareBytesDifferentLength              8.808n ± 0%   7.604n ± 0%  -13.67% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=1       839.85µ ± 0%   82.04µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=2       839.86µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=3       839.86µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=4       839.86µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=5       839.85µ ± 0%   82.04µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=6       839.85µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=7       839.85µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=0    78.77µ ± 0%   78.75µ ± 0%   -0.03% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=1   839.84µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=2   839.84µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=3   839.85µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=4   839.83µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=5   839.85µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=6   839.85µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=7   839.84µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBig                          78.77µ ± 0%   78.75µ ± 0%   -0.03% (p=0.001 n=20)
CompareBytesBigIdentical                 2.802n ± 0%   2.801n ± 0%   -0.04% (p=0.001 n=20)
geomean                                  1.524µ        507.2n       -66.73%

Change-Id: Ice9f4ef0ce0fbb5a6424823c5f8e0c0c369fd159
Reviewed-on: https://go-review.googlesource.com/c/go/+/589538
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Tim King <taking@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Auto-Submit: Tim King <taking@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
This commit is contained in:
Xiaolin Zhao 2024-06-03 11:45:04 +08:00 committed by Gopher Robot
parent 547baafa37
commit 50e536daa1

View File

@ -28,58 +28,136 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
// R7 length of b // R7 length of b
// R4 points to the start of a // R4 points to the start of a
// R6 points to the start of b // R6 points to the start of b
// R13 points to the return value (-1/0/1) // for regabi the return value (-1/0/1) in R4
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0 TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
BEQ R4, R6, samebytes // same start of a and b BEQ R4, R6, cmp_len // same start of a and b, then compare lengths
SGTU R5, R7, R9 SGTU R5, R7, R9
BNE R0, R9, r2_lt_r1 BNE R9, b_lt_a
MOVV R5, R14 MOVV R5, R14
JMP entry JMP entry
r2_lt_r1: b_lt_a:
MOVV R7, R14 // R14 is min(R4, R5) MOVV R7, R14 // R14 is min(R5, R7)
entry: entry:
ADDV R4, R14, R12 // R6 start of a, R14 end of a ADDV R4, R14, R12 // R4 start of a, R12 end of a
BEQ R4, R12, samebytes // length is 0 BEQ R4, R12, cmp_len // minlength is 0
SRLV $4, R14 // R14 is number of chunks tail:
BEQ R0, R14, byte_loop MOVV $2, R15
BLT R14, R15, cmp1 // min < 2
SLLV $1, R15
BLT R14, R15, cmp2 // min < 4
SLLV $1, R15
BLT R14, R15, cmp4 // min < 8
SLLV $1, R15
BLT R14, R15, cmp8 // min < 16
SLLV $1, R15
BLT R14, R15, cmp16 // min < 32
// make sure both a and b are aligned. // When min >= 32 bytes, enter the cmp32_loop loop processing:
OR R4, R6, R15 // take out 4 8-bytes from a and b in turn for comparison.
AND $7, R15 cmp32_loop:
BNE R0, R15, byte_loop
PCALIGN $16
chunk16_loop:
BEQ R0, R14, byte_loop
MOVV (R4), R8 MOVV (R4), R8
MOVV (R6), R9 MOVV (R6), R9
BNE R8, R9, byte_loop MOVV 8(R4), R10
MOVV 8(R4), R16 MOVV 8(R6), R11
MOVV 8(R6), R17 BNE R8, R9, cmp8a
BNE R10, R11, cmp8b
MOVV 16(R4), R8
MOVV 16(R6), R9
MOVV 24(R4), R10
MOVV 24(R6), R11
BNE R8, R9, cmp8a
BNE R10, R11, cmp8b
ADDV $32, R4
ADDV $32, R6
SUBV $32, R14
BGE R14, R15, cmp32_loop
BEQ R14, cmp_len
check16:
MOVV $16, R15
BLT R14, R15, check8
cmp16:
MOVV (R4), R8
MOVV (R6), R9
MOVV 8(R4), R10
MOVV 8(R6), R11
BNE R8, R9, cmp8a
BNE R10, R11, cmp8b
ADDV $16, R4 ADDV $16, R4
ADDV $16, R6 ADDV $16, R6
SUBVU $1, R14 SUBV $16, R14
BEQ R16, R17, chunk16_loop BEQ R14, cmp_len
SUBV $8, R4
SUBV $8, R6
byte_loop: check8:
BEQ R4, R12, samebytes MOVV $8, R15
BLT R14, R15, check4
cmp8:
MOVV (R4), R8
MOVV (R6), R9
BNE R8, R9, cmp8a
ADDV $8, R4
ADDV $8, R6
SUBV $8, R14
BEQ R14, cmp_len
check4:
MOVV $4, R15
BLT R14, R15, check2
cmp4:
MOVW (R4), R8
MOVW (R6), R9
BNE R8, R9, cmp8a
ADDV $4, R4
ADDV $4, R6
SUBV $4, R14
BEQ R14, cmp_len
check2:
MOVV $2, R15
BLT R14, R15, cmp1
cmp2:
MOVH (R4), R8
MOVH (R6), R9
BNE R8, R9, cmp8a
ADDV $2, R4
ADDV $2, R6
SUBV $2, R14
BEQ R14, cmp_len
cmp1:
BEQ R14, cmp_len
MOVBU (R4), R8 MOVBU (R4), R8
ADDVU $1, R4
MOVBU (R6), R9 MOVBU (R6), R9
ADDVU $1, R6 BNE R8, R9, byte_cmp
BEQ R8, R9, byte_loop JMP cmp_len
// Compare 8/4/2 bytes taken from R8/R9 that are known to differ.
cmp8a:
MOVV R8, R10
MOVV R9, R11
// Compare 8/4/2 bytes taken from R10/R11 that are known to differ.
cmp8b:
MOVV $0xff, R15
// Take single bytes from R10/R11 in turn for cyclic comparison.
cmp8_loop:
AND R10, R15, R8
AND R11, R15, R9
BNE R8, R9, byte_cmp
SLLV $8, R15
JMP cmp8_loop
// Compare 1 bytes taken from R8/R9 that are known to differ.
byte_cmp: byte_cmp:
SGTU R8, R9, R4 // R12 = 1 if (R8 > R9) SGTU R8, R9, R4 // R4 = 1 if (R8 > R9)
BNE R0, R4, ret BNE R0, R4, ret
MOVV $-1, R4 MOVV $-1, R4
JMP ret JMP ret
samebytes: cmp_len:
SGTU R5, R7, R8 SGTU R5, R7, R8
SGTU R7, R5, R9 SGTU R7, R5, R9
SUBV R9, R8, R4 SUBV R9, R8, R4