From 24daaeea097701ca26edcd94fbd015b3cca518b5 Mon Sep 17 00:00:00 2001 From: Huang Qiqi Date: Tue, 11 Jun 2024 20:33:50 +0800 Subject: [PATCH] math/big: optimize subVW function for loong64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Benchmark results on Loongson 3C5000 (which is an LA464 implementation): goos: linux goarch: loong64 pkg: math/big cpu: Loongson-3C5000 @ 2200.00MHz │ test/old_3c5000_subvw.log │ test/new_3c5000_subvw.log │ │ sec/op │ sec/op vs base │ SubVW/1 8.564n ± 0% 5.915n ± 0% -30.93% (p=0.000 n=20) SubVW/2 11.675n ± 0% 6.825n ± 0% -41.54% (p=0.000 n=20) SubVW/3 13.410n ± 0% 7.969n ± 0% -40.57% (p=0.000 n=20) SubVW/4 15.300n ± 0% 9.740n ± 0% -36.34% (p=0.000 n=20) SubVW/5 17.34n ± 1% 10.66n ± 0% -38.55% (p=0.000 n=20) SubVW/10 26.55n ± 0% 15.21n ± 0% -42.70% (p=0.000 n=20) SubVW/100 199.2n ± 0% 102.5n ± 0% -48.52% (p=0.000 n=20) SubVW/1000 1866.5n ± 1% 924.6n ± 0% -50.46% (p=0.000 n=20) SubVW/10000 17.67µ ± 2% 12.04µ ± 2% -31.83% (p=0.000 n=20) SubVW/100000 186.4µ ± 0% 132.0µ ± 0% -29.17% (p=0.000 n=20) SubVWext/1 8.616n ± 0% 5.949n ± 0% -30.95% (p=0.000 n=20) SubVWext/2 11.410n ± 0% 7.008n ± 1% -38.58% (p=0.000 n=20) SubVWext/3 13.255n ± 1% 8.073n ± 0% -39.09% (p=0.000 n=20) SubVWext/4 15.095n ± 0% 9.893n ± 0% -34.47% (p=0.000 n=20) SubVWext/5 16.87n ± 0% 10.86n ± 0% -35.63% (p=0.000 n=20) SubVWext/10 26.00n ± 0% 15.54n ± 0% -40.22% (p=0.000 n=20) SubVWext/100 196.0n ± 0% 104.3n ± 1% -46.76% (p=0.000 n=20) SubVWext/1000 1847.0n ± 0% 923.7n ± 0% -49.99% (p=0.000 n=20) SubVWext/10000 17.30µ ± 1% 11.71µ ± 1% -32.31% (p=0.000 n=20) SubVWext/100000 187.5µ ± 0% 131.6µ ± 0% -29.82% (p=0.000 n=20) geomean 159.7n 97.79n -38.79% Change-Id: I21a6903e79b02cb22282e80c9bfe2ae9f1a87589 Reviewed-on: https://go-review.googlesource.com/c/go/+/659878 Reviewed-by: Carlos Amedee LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Reviewed-by: abner chenc --- src/math/big/arith_loong64.s | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s index a0130efc31..41229a1f0f 100644 --- a/src/math/big/arith_loong64.s +++ b/src/math/big/arith_loong64.s @@ -40,8 +40,30 @@ done: MOVV R10, c+56(FP) RET +// func subVW(z, x []Word, y Word) (c Word) TEXT ·subVW(SB),NOSPLIT,$0 - JMP ·subVW_g(SB) + // input: + // R4: z + // R5: z_len + // R7: x + // R10: y + MOVV z+0(FP), R4 + MOVV z_len+8(FP), R5 + MOVV x+24(FP), R7 + MOVV y+48(FP), R10 + MOVV $0, R6 + SLLV $3, R5 +loop: + BEQ R5, R6, done + MOVV (R6)(R7), R8 + SUBV R10, R8, R11 // x1 - c = z1, if z1 > x1 then overflow + SGTU R11, R8, R10 + MOVV R11, (R6)(R4) + ADDV $8, R6 + JMP loop +done: + MOVV R10, c+56(FP) + RET TEXT ·lshVU(SB),NOSPLIT,$0 JMP ·lshVU_g(SB)