mirror of
https://github.com/golang/go.git
synced 2025-05-07 16:43:03 +00:00
src/internal/bytealg: optimize the function Compare on loong64
The relevant performance improved by 66.73%. benchmark: goos: linux goarch: loong64 pkg: bytes cpu: Loongson-3A6000 @ 2500.00MHz │ old │ new │ │ sec/op │ sec/op vs base │ BytesCompare/1 5.603n ± 0% 4.002n ± 0% -28.57% (p=0.000 n=20) BytesCompare/2 6.405n ± 0% 4.002n ± 0% -37.52% (p=0.000 n=20) BytesCompare/4 8.007n ± 0% 4.002n ± 0% -50.02% (p=0.000 n=20) BytesCompare/8 11.210n ± 0% 4.002n ± 0% -64.30% (p=0.000 n=20) BytesCompare/16 6.005n ± 0% 4.802n ± 0% -20.03% (p=0.000 n=20) BytesCompare/32 6.806n ± 0% 4.402n ± 0% -35.32% (p=0.000 n=20) BytesCompare/64 8.407n ± 0% 6.003n ± 0% -28.60% (p=0.000 n=20) BytesCompare/128 11.610n ± 0% 8.404n ± 0% -27.61% (p=0.000 n=20) BytesCompare/256 18.02n ± 0% 14.01n ± 0% -22.25% (p=0.000 n=20) BytesCompare/512 31.23n ± 0% 26.98n ± 0% -13.61% (p=0.000 n=20) BytesCompare/1024 56.85n ± 0% 52.43n ± 0% -7.77% (p=0.000 n=20) BytesCompare/2048 108.1n ± 0% 103.8n ± 0% -3.98% (p=0.000 n=20) CompareBytesEqual 15.610n ± 0% 5.203n ± 0% -66.67% (p=0.000 n=20) CompareBytesToNil 3.203n ± 0% 3.202n ± 0% -0.03% (p=0.000 n=20) CompareBytesEmpty 3.203n ± 0% 2.423n ± 0% -24.35% (p=0.000 n=20) CompareBytesIdentical 3.203n ± 0% 2.424n ± 0% -24.32% (p=0.000 n=20) CompareBytesSameLength 8.407n ± 0% 8.004n ± 0% -4.79% (p=0.000 n=20) CompareBytesDifferentLength 8.808n ± 0% 7.604n ± 0% -13.67% (p=0.000 n=20) CompareBytesBigUnaligned/offset=1 839.85µ ± 0% 82.04µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigUnaligned/offset=2 839.86µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigUnaligned/offset=3 839.86µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigUnaligned/offset=4 839.86µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigUnaligned/offset=5 839.85µ ± 0% 82.04µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigUnaligned/offset=6 839.85µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigUnaligned/offset=7 839.85µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=0 78.77µ ± 0% 78.75µ ± 0% -0.03% (p=0.000 n=20) 
CompareBytesBigBothUnaligned/offset=1 839.84µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=2 839.84µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=3 839.85µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=4 839.83µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=5 839.85µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=6 839.85µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=7 839.84µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBig 78.77µ ± 0% 78.75µ ± 0% -0.03% (p=0.001 n=20) CompareBytesBigIdentical 2.802n ± 0% 2.801n ± 0% -0.04% (p=0.001 n=20) geomean 1.524µ 507.2n -66.73% Change-Id: Ice9f4ef0ce0fbb5a6424823c5f8e0c0c369fd159 Reviewed-on: https://go-review.googlesource.com/c/go/+/589538 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Tim King <taking@google.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Auto-Submit: Tim King <taking@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
This commit is contained in:
parent
547baafa37
commit
50e536daa1
@ -28,58 +28,136 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
|
|||||||
// R7 length of b
|
// R7 length of b
|
||||||
// R4 points to the start of a
|
// R4 points to the start of a
|
||||||
// R6 points to the start of b
|
// R6 points to the start of b
|
||||||
// R13 points to the return value (-1/0/1)
|
// for regabi, the return value (-1/0/1) is placed in R4
|
||||||
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
|
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
|
||||||
BEQ R4, R6, samebytes // same start of a and b
|
BEQ R4, R6, cmp_len // same start of a and b, then compare lengths
|
||||||
|
|
||||||
SGTU R5, R7, R9
|
SGTU R5, R7, R9
|
||||||
BNE R0, R9, r2_lt_r1
|
BNE R9, b_lt_a
|
||||||
MOVV R5, R14
|
MOVV R5, R14
|
||||||
JMP entry
|
JMP entry
|
||||||
r2_lt_r1:
|
b_lt_a:
|
||||||
MOVV R7, R14 // R14 is min(R4, R5)
|
MOVV R7, R14 // R14 is min(R5, R7)
|
||||||
entry:
|
entry:
|
||||||
ADDV R4, R14, R12 // R6 start of a, R14 end of a
|
ADDV R4, R14, R12 // R4 start of a, R12 end of a
|
||||||
BEQ R4, R12, samebytes // length is 0
|
BEQ R4, R12, cmp_len // minlength is 0
|
||||||
|
|
||||||
SRLV $4, R14 // R14 is number of chunks
|
tail:
|
||||||
BEQ R0, R14, byte_loop
|
MOVV $2, R15
|
||||||
|
BLT R14, R15, cmp1 // min < 2
|
||||||
|
SLLV $1, R15
|
||||||
|
BLT R14, R15, cmp2 // min < 4
|
||||||
|
SLLV $1, R15
|
||||||
|
BLT R14, R15, cmp4 // min < 8
|
||||||
|
SLLV $1, R15
|
||||||
|
BLT R14, R15, cmp8 // min < 16
|
||||||
|
SLLV $1, R15
|
||||||
|
BLT R14, R15, cmp16 // min < 32
|
||||||
|
|
||||||
// make sure both a and b are aligned.
|
// When min >= 32 bytes, enter the cmp32_loop loop processing:
|
||||||
OR R4, R6, R15
|
// load four 8-byte words from each of a and b per iteration and compare them pairwise.
|
||||||
AND $7, R15
|
cmp32_loop:
|
||||||
BNE R0, R15, byte_loop
|
|
||||||
|
|
||||||
PCALIGN $16
|
|
||||||
chunk16_loop:
|
|
||||||
BEQ R0, R14, byte_loop
|
|
||||||
MOVV (R4), R8
|
MOVV (R4), R8
|
||||||
MOVV (R6), R9
|
MOVV (R6), R9
|
||||||
BNE R8, R9, byte_loop
|
MOVV 8(R4), R10
|
||||||
MOVV 8(R4), R16
|
MOVV 8(R6), R11
|
||||||
MOVV 8(R6), R17
|
BNE R8, R9, cmp8a
|
||||||
|
BNE R10, R11, cmp8b
|
||||||
|
MOVV 16(R4), R8
|
||||||
|
MOVV 16(R6), R9
|
||||||
|
MOVV 24(R4), R10
|
||||||
|
MOVV 24(R6), R11
|
||||||
|
BNE R8, R9, cmp8a
|
||||||
|
BNE R10, R11, cmp8b
|
||||||
|
ADDV $32, R4
|
||||||
|
ADDV $32, R6
|
||||||
|
SUBV $32, R14
|
||||||
|
BGE R14, R15, cmp32_loop
|
||||||
|
BEQ R14, cmp_len
|
||||||
|
|
||||||
|
check16:
|
||||||
|
MOVV $16, R15
|
||||||
|
BLT R14, R15, check8
|
||||||
|
cmp16:
|
||||||
|
MOVV (R4), R8
|
||||||
|
MOVV (R6), R9
|
||||||
|
MOVV 8(R4), R10
|
||||||
|
MOVV 8(R6), R11
|
||||||
|
BNE R8, R9, cmp8a
|
||||||
|
BNE R10, R11, cmp8b
|
||||||
ADDV $16, R4
|
ADDV $16, R4
|
||||||
ADDV $16, R6
|
ADDV $16, R6
|
||||||
SUBVU $1, R14
|
SUBV $16, R14
|
||||||
BEQ R16, R17, chunk16_loop
|
BEQ R14, cmp_len
|
||||||
SUBV $8, R4
|
|
||||||
SUBV $8, R6
|
|
||||||
|
|
||||||
byte_loop:
|
check8:
|
||||||
BEQ R4, R12, samebytes
|
MOVV $8, R15
|
||||||
|
BLT R14, R15, check4
|
||||||
|
cmp8:
|
||||||
|
MOVV (R4), R8
|
||||||
|
MOVV (R6), R9
|
||||||
|
BNE R8, R9, cmp8a
|
||||||
|
ADDV $8, R4
|
||||||
|
ADDV $8, R6
|
||||||
|
SUBV $8, R14
|
||||||
|
BEQ R14, cmp_len
|
||||||
|
|
||||||
|
check4:
|
||||||
|
MOVV $4, R15
|
||||||
|
BLT R14, R15, check2
|
||||||
|
cmp4:
|
||||||
|
MOVW (R4), R8
|
||||||
|
MOVW (R6), R9
|
||||||
|
BNE R8, R9, cmp8a
|
||||||
|
ADDV $4, R4
|
||||||
|
ADDV $4, R6
|
||||||
|
SUBV $4, R14
|
||||||
|
BEQ R14, cmp_len
|
||||||
|
|
||||||
|
check2:
|
||||||
|
MOVV $2, R15
|
||||||
|
BLT R14, R15, cmp1
|
||||||
|
cmp2:
|
||||||
|
MOVH (R4), R8
|
||||||
|
MOVH (R6), R9
|
||||||
|
BNE R8, R9, cmp8a
|
||||||
|
ADDV $2, R4
|
||||||
|
ADDV $2, R6
|
||||||
|
SUBV $2, R14
|
||||||
|
BEQ R14, cmp_len
|
||||||
|
|
||||||
|
cmp1:
|
||||||
|
BEQ R14, cmp_len
|
||||||
MOVBU (R4), R8
|
MOVBU (R4), R8
|
||||||
ADDVU $1, R4
|
|
||||||
MOVBU (R6), R9
|
MOVBU (R6), R9
|
||||||
ADDVU $1, R6
|
BNE R8, R9, byte_cmp
|
||||||
BEQ R8, R9, byte_loop
|
JMP cmp_len
|
||||||
|
|
||||||
|
// Compare 8/4/2 bytes taken from R8/R9 that are known to differ.
|
||||||
|
cmp8a:
|
||||||
|
MOVV R8, R10
|
||||||
|
MOVV R9, R11
|
||||||
|
|
||||||
|
// Compare 8/4/2 bytes taken from R10/R11 that are known to differ.
|
||||||
|
cmp8b:
|
||||||
|
MOVV $0xff, R15
|
||||||
|
|
||||||
|
// Compare R10/R11 one byte at a time, starting from the lowest byte, until the differing byte is found.
|
||||||
|
cmp8_loop:
|
||||||
|
AND R10, R15, R8
|
||||||
|
AND R11, R15, R9
|
||||||
|
BNE R8, R9, byte_cmp
|
||||||
|
SLLV $8, R15
|
||||||
|
JMP cmp8_loop
|
||||||
|
|
||||||
|
// Compare 1 byte taken from R8/R9 that is known to differ.
|
||||||
byte_cmp:
|
byte_cmp:
|
||||||
SGTU R8, R9, R4 // R12 = 1 if (R8 > R9)
|
SGTU R8, R9, R4 // R4 = 1 if (R8 > R9)
|
||||||
BNE R0, R4, ret
|
BNE R0, R4, ret
|
||||||
MOVV $-1, R4
|
MOVV $-1, R4
|
||||||
JMP ret
|
JMP ret
|
||||||
|
|
||||||
samebytes:
|
cmp_len:
|
||||||
SGTU R5, R7, R8
|
SGTU R5, R7, R8
|
||||||
SGTU R7, R5, R9
|
SGTU R7, R5, R9
|
||||||
SUBV R9, R8, R4
|
SUBV R9, R8, R4
|
||||||
|
Loading…
x
Reference in New Issue
Block a user