math/big: optimize mulAddVWW function for loong64

Benchmark results on Loongson 3A5000 (which is an LA464 implementation):

goos: linux
goarch: loong64
pkg: math/big
cpu: Loongson-3A5000-HV @ 2500.00MHz
                 │ test/old_3a5000_muladdvww.log │    test/new_3a5000_muladdvww.log    │
                 │            sec/op             │   sec/op     vs base                │
MulAddVWW/1                          7.606n ± 0%   6.987n ± 0%   -8.14% (p=0.000 n=20)
MulAddVWW/2                          9.207n ± 0%   8.567n ± 0%   -6.95% (p=0.000 n=20)
MulAddVWW/3                         10.810n ± 0%   9.223n ± 0%  -14.68% (p=0.000 n=20)
MulAddVWW/4                          13.01n ± 0%   12.41n ± 0%   -4.61% (p=0.000 n=20)
MulAddVWW/5                          15.79n ± 0%   12.99n ± 0%  -17.73% (p=0.000 n=20)
MulAddVWW/10                         25.62n ± 0%   20.02n ± 0%  -21.86% (p=0.000 n=20)
MulAddVWW/100                        217.0n ± 0%   170.9n ± 0%  -21.24% (p=0.000 n=20)
MulAddVWW/1000                       2.064µ ± 0%   1.612µ ± 0%  -21.90% (p=0.000 n=20)
MulAddVWW/10000                      24.50µ ± 0%   16.74µ ± 0%  -31.66% (p=0.000 n=20)
MulAddVWW/100000                     239.1µ ± 0%   171.1µ ± 0%  -28.45% (p=0.000 n=20)
geomean                              159.2n        130.3n       -18.18%

Change-Id: I063434bc382f4f1234f879172ab671a3d6f2eb80
Reviewed-on: https://go-review.googlesource.com/c/go/+/659881
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Carlos Amedee <carlos@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Huang Qiqi 2024-06-19 06:31:00 +00:00 committed by abner chenc
parent 24daaeea09
commit 72fa8adbdc

View File

@ -71,8 +71,35 @@ TEXT ·lshVU(SB),NOSPLIT,$0
TEXT ·rshVU(SB),NOSPLIT,$0
JMP ·rshVU_g(SB)
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
JMP ·mulAddVWW_g(SB)
// input:
// R4: z
// R5: z_len
// R7: x
// R10: y
// R11: r
MOVV z+0(FP), R4
MOVV z_len+8(FP), R5
MOVV x+24(FP), R7
MOVV y+48(FP), R10
MOVV r+56(FP), R11
SLLV $3, R5
MOVV $0, R6
loop:
BEQ R5, R6, done
MOVV (R6)(R7), R8
MULV R8, R10, R9
MULHVU R8, R10, R12
ADDV R9, R11, R8
SGTU R9, R8, R11 // if (c' = lo + c) < lo then overflow
MOVV R8, (R6)(R4)
ADDV R12, R11
ADDV $8, R6
JMP loop
done:
MOVV R11, c+64(FP)
RET
TEXT ·addMulVVWW(SB),NOSPLIT,$0
JMP ·addMulVVWW_g(SB)