src/internal/bytealg: optimize the function Compare on loong64

The relevant performance improved by 66.73%.

benchmark:
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000 @ 2500.00MHz
                                      │     old      │                 new                 │
                                      │    sec/op    │   sec/op     vs base                │
BytesCompare/1                           5.603n ± 0%   4.002n ± 0%  -28.57% (p=0.000 n=20)
BytesCompare/2                           6.405n ± 0%   4.002n ± 0%  -37.52% (p=0.000 n=20)
BytesCompare/4                           8.007n ± 0%   4.002n ± 0%  -50.02% (p=0.000 n=20)
BytesCompare/8                          11.210n ± 0%   4.002n ± 0%  -64.30% (p=0.000 n=20)
BytesCompare/16                          6.005n ± 0%   4.802n ± 0%  -20.03% (p=0.000 n=20)
BytesCompare/32                          6.806n ± 0%   4.402n ± 0%  -35.32% (p=0.000 n=20)
BytesCompare/64                          8.407n ± 0%   6.003n ± 0%  -28.60% (p=0.000 n=20)
BytesCompare/128                        11.610n ± 0%   8.404n ± 0%  -27.61% (p=0.000 n=20)
BytesCompare/256                         18.02n ± 0%   14.01n ± 0%  -22.25% (p=0.000 n=20)
BytesCompare/512                         31.23n ± 0%   26.98n ± 0%  -13.61% (p=0.000 n=20)
BytesCompare/1024                        56.85n ± 0%   52.43n ± 0%   -7.77% (p=0.000 n=20)
BytesCompare/2048                        108.1n ± 0%   103.8n ± 0%   -3.98% (p=0.000 n=20)
CompareBytesEqual                       15.610n ± 0%   5.203n ± 0%  -66.67% (p=0.000 n=20)
CompareBytesToNil                        3.203n ± 0%   3.202n ± 0%   -0.03% (p=0.000 n=20)
CompareBytesEmpty                        3.203n ± 0%   2.423n ± 0%  -24.35% (p=0.000 n=20)
CompareBytesIdentical                    3.203n ± 0%   2.424n ± 0%  -24.32% (p=0.000 n=20)
CompareBytesSameLength                   8.407n ± 0%   8.004n ± 0%   -4.79% (p=0.000 n=20)
CompareBytesDifferentLength              8.808n ± 0%   7.604n ± 0%  -13.67% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=1       839.85µ ± 0%   82.04µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=2       839.86µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=3       839.86µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=4       839.86µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=5       839.85µ ± 0%   82.04µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=6       839.85µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=7       839.85µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=0    78.77µ ± 0%   78.75µ ± 0%   -0.03% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=1   839.84µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=2   839.84µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=3   839.85µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=4   839.83µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=5   839.85µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=6   839.85µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=7   839.84µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBig                          78.77µ ± 0%   78.75µ ± 0%   -0.03% (p=0.001 n=20)
CompareBytesBigIdentical                 2.802n ± 0%   2.801n ± 0%   -0.04% (p=0.001 n=20)
geomean                                  1.524µ        507.2n       -66.73%

Change-Id: Ice9f4ef0ce0fbb5a6424823c5f8e0c0c369fd159
Reviewed-on: https://go-review.googlesource.com/c/go/+/589538
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Tim King <taking@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Auto-Submit: Tim King <taking@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
This commit is contained in:
Xiaolin Zhao 2024-06-03 11:45:04 +08:00 committed by Gopher Robot
parent 547baafa37
commit 50e536daa1

View File

@ -28,58 +28,136 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
// R7 length of b
// R4 points to the start of a
// R6 points to the start of b
// R13 points to the return value (-1/0/1)
// for regabi the return value (-1/0/1) in R4
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
BEQ R4, R6, samebytes // same start of a and b
BEQ R4, R6, cmp_len // same start of a and b, then compare lengths
SGTU R5, R7, R9
BNE R0, R9, r2_lt_r1
BNE R9, b_lt_a
MOVV R5, R14
JMP entry
r2_lt_r1:
MOVV R7, R14 // R14 is min(R4, R5)
b_lt_a:
MOVV R7, R14 // R14 is min(R5, R7)
entry:
ADDV R4, R14, R12 // R6 start of a, R14 end of a
BEQ R4, R12, samebytes // length is 0
ADDV R4, R14, R12 // R4 start of a, R12 end of a
BEQ R4, R12, cmp_len // minlength is 0
SRLV $4, R14 // R14 is number of chunks
BEQ R0, R14, byte_loop
tail:
MOVV $2, R15
BLT R14, R15, cmp1 // min < 2
SLLV $1, R15
BLT R14, R15, cmp2 // min < 4
SLLV $1, R15
BLT R14, R15, cmp4 // min < 8
SLLV $1, R15
BLT R14, R15, cmp8 // min < 16
SLLV $1, R15
BLT R14, R15, cmp16 // min < 32
// make sure both a and b are aligned.
OR R4, R6, R15
AND $7, R15
BNE R0, R15, byte_loop
PCALIGN $16
chunk16_loop:
BEQ R0, R14, byte_loop
// When min >= 32 bytes, enter the cmp32_loop loop processing:
// take out 4 8-bytes from a and b in turn for comparison.
cmp32_loop:
MOVV (R4), R8
MOVV (R6), R9
BNE R8, R9, byte_loop
MOVV 8(R4), R16
MOVV 8(R6), R17
MOVV 8(R4), R10
MOVV 8(R6), R11
BNE R8, R9, cmp8a
BNE R10, R11, cmp8b
MOVV 16(R4), R8
MOVV 16(R6), R9
MOVV 24(R4), R10
MOVV 24(R6), R11
BNE R8, R9, cmp8a
BNE R10, R11, cmp8b
ADDV $32, R4
ADDV $32, R6
SUBV $32, R14
BGE R14, R15, cmp32_loop
BEQ R14, cmp_len
check16:
MOVV $16, R15
BLT R14, R15, check8
cmp16:
MOVV (R4), R8
MOVV (R6), R9
MOVV 8(R4), R10
MOVV 8(R6), R11
BNE R8, R9, cmp8a
BNE R10, R11, cmp8b
ADDV $16, R4
ADDV $16, R6
SUBVU $1, R14
BEQ R16, R17, chunk16_loop
SUBV $8, R4
SUBV $8, R6
SUBV $16, R14
BEQ R14, cmp_len
byte_loop:
BEQ R4, R12, samebytes
check8:
MOVV $8, R15
BLT R14, R15, check4
cmp8:
MOVV (R4), R8
MOVV (R6), R9
BNE R8, R9, cmp8a
ADDV $8, R4
ADDV $8, R6
SUBV $8, R14
BEQ R14, cmp_len
check4:
MOVV $4, R15
BLT R14, R15, check2
cmp4:
MOVW (R4), R8
MOVW (R6), R9
BNE R8, R9, cmp8a
ADDV $4, R4
ADDV $4, R6
SUBV $4, R14
BEQ R14, cmp_len
check2:
MOVV $2, R15
BLT R14, R15, cmp1
cmp2:
MOVH (R4), R8
MOVH (R6), R9
BNE R8, R9, cmp8a
ADDV $2, R4
ADDV $2, R6
SUBV $2, R14
BEQ R14, cmp_len
cmp1:
BEQ R14, cmp_len
MOVBU (R4), R8
ADDVU $1, R4
MOVBU (R6), R9
ADDVU $1, R6
BEQ R8, R9, byte_loop
BNE R8, R9, byte_cmp
JMP cmp_len
// Compare 8/4/2 bytes taken from R8/R9 that are known to differ.
cmp8a:
MOVV R8, R10
MOVV R9, R11
// Compare 8/4/2 bytes taken from R10/R11 that are known to differ.
cmp8b:
MOVV $0xff, R15
// Take single bytes from R10/R11 in turn for cyclic comparison.
cmp8_loop:
AND R10, R15, R8
AND R11, R15, R9
BNE R8, R9, byte_cmp
SLLV $8, R15
JMP cmp8_loop
// Compare 1 bytes taken from R8/R9 that are known to differ.
byte_cmp:
SGTU R8, R9, R4 // R12 = 1 if (R8 > R9)
SGTU R8, R9, R4 // R4 = 1 if (R8 > R9)
BNE R0, R4, ret
MOVV $-1, R4
JMP ret
samebytes:
cmp_len:
SGTU R5, R7, R8
SGTU R7, R5, R9
SUBV R9, R8, R4