math/big: replace assembly with mini-compiler output

Step 4 of the mini-compiler: switch to the new generated assembly.
No systematic performance regressions, and many many improvements.

In the benchmarks, the systems are:

	c3h88     GOARCH=amd64     c3h88 perf gomote (newer Intel, Google Cloud)
	c2s16     GOARCH=amd64     c2s16 perf gomote (Intel, Google Cloud)
	s7        GOARCH=amd64     rsc basement server (AMD Ryzen 9 7950X)
	386       GOARCH=386       gotip-linux-386 gomote (Intel, Google Cloud)
	s7-386    GOARCH=386       rsc basement server (AMD Ryzen 9 7950X)
	c4as16    GOARCH=arm64     c4as16 perf gomote (Google Cloud)
	mac       GOARCH=arm64     Apple M3 Pro in MacBook Pro
	arm       GOARCH=arm       gotip-linux-arm gomote
	loong64   GOARCH=loong64   gotip-linux-loong64 gomote
	ppc64le   GOARCH=ppc64le   gotip-linux-ppc64le gomote
	riscv64   GOARCH=riscv64   gotip-linux-riscv64 gomote
	s390x     GOARCH=s390x     linux-s390x-ibm old gomote

benchmark \ system           c3h88    c2s16       s7      386   s7-386   c4as16      mac      arm  loong64  ppc64le  riscv64    s390x
AddVV/words=1               -4.03%   +5.21%   -4.04%   +4.94%        ~        ~        ~        ~  -19.51%        ~        ~        ~
AddVV/words=10             -10.20%   +0.34%   -3.46%  -11.50%   -7.46%   +7.66%   +5.97%        ~  -17.90%        ~        ~        ~
AddVV/words=16             -10.91%   -6.45%   -8.45%  -21.86%  -17.90%   +2.73%   -1.61%        ~  -22.47%   -3.54%        ~        ~
AddVV/words=100             -3.77%   -4.30%   -3.17%  -47.27%  -45.34%   -0.78%        ~   -8.74%  -27.19%        ~        ~        ~
AddVV/words=1000            -0.08%   -0.71%        ~  -49.21%  -48.07%        ~        ~  -16.80%  -24.74%        ~        ~        ~
AddVV/words=10000                ~        ~        ~  -48.73%  -48.56%   -0.06%        ~  -17.08%        ~        ~   -4.81%        ~
AddVV/words=100000               ~        ~        ~  -47.80%  -48.38%        ~        ~  -15.10%  -25.06%        ~   -5.34%        ~
SubVV/words=1               -0.84%   +3.43%   -3.62%   +1.34%        ~   -0.76%        ~        ~  -18.18%   +5.58%        ~        ~
SubVV/words=10              -9.99%   +0.34%        ~  -11.23%   -8.24%   +7.53%   +6.15%        ~  -17.55%   +2.77%   -2.08%        ~
SubVV/words=16             -11.94%   -6.45%   -6.81%  -21.82%  -18.11%   +1.58%   -1.21%        ~  -20.36%        ~        ~        ~
SubVV/words=100             -3.38%   -4.32%   -1.80%  -46.14%  -46.43%   +0.41%        ~   -7.20%  -26.17%        ~   -0.42%        ~
SubVV/words=1000            -0.38%   -0.80%        ~  -49.22%  -48.90%        ~        ~  -15.86%  -24.73%        ~        ~        ~
SubVV/words=10000                ~        ~        ~  -49.57%  -49.64%   -0.03%        ~  -15.85%  -26.52%        ~   -5.05%        ~
SubVV/words=100000               ~        ~        ~  -46.88%  -49.66%        ~        ~  -15.45%  -16.11%        ~   -4.99%        ~
LshVU/words=1                    ~   +5.78%        ~        ~   -2.48%   +1.61%   +2.18%   +2.70%  -18.16%  -34.16%  -21.29%        ~
LshVU/words=10             -18.34%   -3.78%   +2.21%        ~        ~   -2.81%  -12.54%        ~  -25.02%  -24.78%  -38.11%  -66.98%
LshVU/words=16             -23.15%   +1.03%   +7.74%   +0.73%        ~   +8.88%   +1.56%        ~  -25.37%  -28.46%  -41.27%        ~
LshVU/words=100            -32.85%   -8.86%   -2.58%        ~   +2.69%   +1.24%        ~  -20.63%  -44.14%  -42.68%  -53.09%        ~
LshVU/words=1000           -37.30%   -0.20%   +5.67%        ~        ~   +1.44%        ~  -27.83%  -45.01%  -37.07%  -57.02%  -46.57%
LshVU/words=10000          -36.84%   -2.30%   +3.82%        ~   +1.86%   +1.57%  -66.81%  -28.00%  -13.15%  -35.40%  -41.97%        ~
LshVU/words=100000         -40.30%        ~   +3.96%        ~        ~        ~        ~  -24.91%  -19.06%  -36.14%  -40.99%  -66.03%
RshVU/words=1               -3.17%   +4.76%   -4.06%   +4.31%   +4.55%        ~        ~        ~  -20.61%        ~  -26.20%  -51.33%
RshVU/words=10             -22.08%   -4.41%  -17.99%   +3.64%  -11.87%        ~  -16.30%        ~  -30.01%        ~  -40.37%  -63.05%
RshVU/words=16             -26.03%   -8.50%  -18.09%        ~  -17.52%   +6.50%        ~   -2.85%  -30.24%        ~  -42.93%  -63.13%
RshVU/words=100            -20.87%  -28.83%  -29.45%        ~  -26.25%   +1.46%   -1.14%  -16.20%  -45.65%  -16.20%  -53.66%  -77.27%
RshVU/words=1000           -24.03%  -21.37%  -26.71%        ~  -28.95%   +0.98%        ~  -18.82%  -45.21%  -23.55%  -57.09%  -71.18%
RshVU/words=10000          -24.56%  -22.44%  -27.01%        ~  -28.88%   +0.78%   -5.35%  -17.47%  -16.87%  -20.67%  -41.97%        ~
RshVU/words=100000         -23.36%  -15.65%  -27.54%        ~  -29.26%   +1.73%   -6.67%  -13.68%  -21.40%  -23.02%  -40.37%  -66.31%
MulAddVWW/words=1           +2.37%   +8.14%        ~   +4.10%   +3.71%        ~        ~        ~  -21.62%        ~   +1.12%        ~
MulAddVWW/words=10               ~   -2.72%  -15.15%   +8.04%        ~        ~        ~   -2.52%  -19.48%        ~   -6.18%        ~
MulAddVWW/words=16               ~   +1.49%        ~   +4.49%   +6.58%   -8.70%   -7.16%  -12.08%  -21.43%   -6.59%   -9.05%        ~
MulAddVWW/words=100         +0.37%   +1.11%   -4.51%  -13.59%        ~  -11.10%   -3.63%  -21.40%  -22.27%   -2.92%  -14.41%        ~
MulAddVWW/words=1000             ~   +0.90%   -7.13%  -18.94%        ~  -14.02%   -9.97%  -28.31%  -18.72%   -2.32%  -15.80%        ~
MulAddVWW/words=10000            ~   +1.08%   -6.75%  -19.10%        ~  -14.61%   -9.04%  -28.48%  -14.29%   -2.25%   -9.40%        ~
MulAddVWW/words=100000           ~        ~   -6.93%  -18.09%        ~  -14.33%   -9.66%  -28.92%  -16.63%   -2.43%   -8.23%        ~
AddMulVVWW/words=1          +2.30%   +4.83%  -11.37%   +4.58%        ~   -3.14%        ~        ~  -10.58%  +30.35%        ~        ~
AddMulVVWW/words=10         -3.27%        ~   +8.96%   +5.74%        ~   +2.67%   -1.44%   -7.64%  -13.41%        ~        ~        ~
AddMulVVWW/words=16         -6.12%        ~        ~        ~   +1.91%   -7.90%  -16.22%  -14.07%  -14.26%   -4.15%   -7.30%        ~
AddMulVVWW/words=100        -5.48%   -2.14%        ~   -9.40%   +9.98%   -1.43%  -12.35%  -18.56%  -21.94%        ~   -9.84%        ~
AddMulVVWW/words=1000      -11.35%   -3.40%   -3.64%  -11.04%  +12.82%   -1.33%  -15.63%  -20.50%  -20.95%        ~  -11.06%  -51.97%
AddMulVVWW/words=10000     -10.31%   -1.61%   -8.41%  -12.15%  +13.10%   -1.03%  -16.34%  -22.46%   -1.00%        ~  -10.33%  -49.80%
AddMulVVWW/words=100000    -13.71%        ~   -8.31%  -12.18%  +12.98%   -1.35%  -15.20%  -21.89%        ~        ~   -9.38%  -48.30%

Change-Id: I0a33c33602c0d053c84d9946e662500cfa048e2d
Reviewed-on: https://go-review.googlesource.com/c/go/+/664938
Reviewed-by: Alan Donovan <adonovan@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Russ Cox 2025-04-10 17:01:24 -04:00 committed by Gopher Robot
parent 39070da4f8
commit 7f516a31b0
17 changed files with 4200 additions and 2523 deletions

View File

@ -1,192 +1,240 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL y+24(FP), CX
MOVL z_len+4(FP), BP
MOVL $0, BX // i = 0
MOVL $0, DX // c = 0
JMP E1
L1: MOVL (SI)(BX*4), AX
ADDL DX, DX // restore CF
ADCL (CX)(BX*4), AX
SBBL DX, DX // save CF
MOVL AX, (DI)(BX*4)
ADDL $1, BX // i++
E1: CMPL BX, BP // i < n
JL L1
NEGL DX
// addVV sets z = x + y and returns the outgoing carry c (0 or 1).
// The carry is kept between iterations in DX as 0 or -1 (via SBBL DX, DX)
// and re-materialized into CF with ADDL DX, DX.
TEXT ·addVV(SB), NOSPLIT, $0
MOVL z_len+4(FP), BX
MOVL x_base+12(FP), SI
MOVL y_base+24(FP), DI
MOVL z_base+0(FP), BP
// compute unrolled loop lengths
MOVL BX, CX
ANDL $3, CX // CX = n % 4: scalar prologue count
SHRL $2, BX // BX = n / 4: unrolled iteration count
MOVL $0, DX // clear saved carry
loop1:
TESTL CX, CX; JZ loop1done
loop1cont:
// unroll 1X in batches of 1
ADDL DX, DX // restore carry
MOVL 0(SI), DX
ADCL 0(DI), DX
MOVL DX, 0(BP)
SBBL DX, DX // save carry
LEAL 4(SI), SI // ADD $4, SI
LEAL 4(DI), DI // ADD $4, DI
LEAL 4(BP), BP // ADD $4, BP
SUBL $1, CX; JNZ loop1cont
loop1done:
loop4:
TESTL BX, BX; JZ loop4done
loop4cont:
// unroll 4X in batches of 1
ADDL DX, DX // restore carry
MOVL 0(SI), CX
ADCL 0(DI), CX
MOVL CX, 0(BP)
MOVL 4(SI), CX
ADCL 4(DI), CX
MOVL CX, 4(BP)
MOVL 8(SI), CX
ADCL 8(DI), CX
MOVL CX, 8(BP)
MOVL 12(SI), CX
ADCL 12(DI), CX
MOVL CX, 12(BP)
SBBL DX, DX // save carry
LEAL 16(SI), SI // ADD $16, SI
LEAL 16(DI), DI // ADD $16, DI
LEAL 16(BP), BP // ADD $16, BP
SUBL $1, BX; JNZ loop4cont
loop4done:
NEGL DX // convert add carry (-1 -> 1, 0 -> 0)
MOVL DX, c+36(FP)
RET
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBL instead of ADCL and label names)
TEXT ·subVV(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL y+24(FP), CX
MOVL z_len+4(FP), BP
MOVL $0, BX // i = 0
MOVL $0, DX // c = 0
JMP E2
L2: MOVL (SI)(BX*4), AX
ADDL DX, DX // restore CF
SBBL (CX)(BX*4), AX
SBBL DX, DX // save CF
MOVL AX, (DI)(BX*4)
ADDL $1, BX // i++
E2: CMPL BX, BP // i < n
JL L2
NEGL DX
// subVV sets z = x - y and returns the outgoing borrow c (0 or 1).
// Same structure as addVV, with SBBL in place of ADCL; the borrow is
// kept between iterations in DX as 0 or -1.
TEXT ·subVV(SB), NOSPLIT, $0
MOVL z_len+4(FP), BX
MOVL x_base+12(FP), SI
MOVL y_base+24(FP), DI
MOVL z_base+0(FP), BP
// compute unrolled loop lengths
MOVL BX, CX
ANDL $3, CX // CX = n % 4: scalar prologue count
SHRL $2, BX // BX = n / 4: unrolled iteration count
MOVL $0, DX // clear saved carry
loop1:
TESTL CX, CX; JZ loop1done
loop1cont:
// unroll 1X in batches of 1
ADDL DX, DX // restore carry
MOVL 0(SI), DX
SBBL 0(DI), DX
MOVL DX, 0(BP)
SBBL DX, DX // save carry
LEAL 4(SI), SI // ADD $4, SI
LEAL 4(DI), DI // ADD $4, DI
LEAL 4(BP), BP // ADD $4, BP
SUBL $1, CX; JNZ loop1cont
loop1done:
loop4:
TESTL BX, BX; JZ loop4done
loop4cont:
// unroll 4X in batches of 1
ADDL DX, DX // restore carry
MOVL 0(SI), CX
SBBL 0(DI), CX
MOVL CX, 0(BP)
MOVL 4(SI), CX
SBBL 4(DI), CX
MOVL CX, 4(BP)
MOVL 8(SI), CX
SBBL 8(DI), CX
MOVL CX, 8(BP)
MOVL 12(SI), CX
SBBL 12(DI), CX
MOVL CX, 12(BP)
SBBL DX, DX // save carry
LEAL 16(SI), SI // ADD $16, SI
LEAL 16(DI), DI // ADD $16, DI
LEAL 16(BP), BP // ADD $16, BP
SUBL $1, BX; JNZ loop4cont
loop4done:
NEGL DX // convert sub carry (-1 -> 1, 0 -> 0)
MOVL DX, c+36(FP)
RET
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB),NOSPLIT,$0
MOVL z_len+4(FP), BX // i = z
SUBL $1, BX // i--
JL X8b // i < 0 (n <= 0)
// n > 0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
TEXT ·lshVU(SB), NOSPLIT, $0
MOVL z_len+4(FP), BX
TESTL BX, BX; JZ ret0
MOVL s+24(FP), CX
MOVL (SI)(BX*4), AX // w1 = x[n-1]
MOVL x_base+12(FP), SI
MOVL z_base+0(FP), DI
// run loop backward, using counter as positive index
// shift first word into carry
MOVL -4(SI)(BX*4), BP
MOVL $0, DX
SHLL CX, AX, DX // w1>>ŝ
SHLL CX, BP, DX
MOVL DX, c+28(FP)
CMPL BX, $0
JLE X8a // i <= 0
// i > 0
L8: MOVL AX, DX // w = w1
MOVL -4(SI)(BX*4), AX // w1 = x[i-1]
SHLL CX, AX, DX // w<<s | w1>>ŝ
MOVL DX, (DI)(BX*4) // z[i] = w<<s | w1>>ŝ
SUBL $1, BX // i--
JG L8 // i > 0
// i <= 0
X8a: SHLL CX, AX // w1<<s
MOVL AX, (DI) // z[0] = w1<<s
// shift remaining words
SUBL $1, BX
loop1:
TESTL BX, BX; JZ loop1done
loop1cont:
// unroll 1X in batches of 1
MOVL -4(SI)(BX*4), DX
SHLL CX, DX, BP
MOVL BP, 0(DI)(BX*4)
MOVL DX, BP
SUBL $1, BX; JNZ loop1cont
loop1done:
// store final shifted bits
SHLL CX, BP
MOVL BP, 0(DI)(BX*4)
RET
X8b: MOVL $0, c+28(FP)
ret0:
MOVL $0, c+28(FP)
RET
// func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB),NOSPLIT,$0
MOVL z_len+4(FP), BP
SUBL $1, BP // n--
JL X9b // n < 0 (n <= 0)
// n > 0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
TEXT ·rshVU(SB), NOSPLIT, $0
MOVL z_len+4(FP), BX
TESTL BX, BX; JZ ret0
MOVL s+24(FP), CX
MOVL (SI), AX // w1 = x[0]
MOVL x_base+12(FP), SI
MOVL z_base+0(FP), DI
// use counter as negative index
LEAL (SI)(BX*4), SI
LEAL (DI)(BX*4), DI
NEGL BX
// shift first word into carry
MOVL 0(SI)(BX*4), BP
MOVL $0, DX
SHRL CX, AX, DX // w1<<ŝ
SHRL CX, BP, DX
MOVL DX, c+28(FP)
MOVL $0, BX // i = 0
JMP E9
// i < n-1
L9: MOVL AX, DX // w = w1
MOVL 4(SI)(BX*4), AX // w1 = x[i+1]
SHRL CX, AX, DX // w>>s | w1<<ŝ
MOVL DX, (DI)(BX*4) // z[i] = w>>s | w1<<ŝ
ADDL $1, BX // i++
E9: CMPL BX, BP
JL L9 // i < n-1
// i >= n-1
X9a: SHRL CX, AX // w1>>s
MOVL AX, (DI)(BP*4) // z[n-1] = w1>>s
// shift remaining words
ADDL $1, BX
loop1:
TESTL BX, BX; JZ loop1done
loop1cont:
// unroll 1X in batches of 1
MOVL 0(SI)(BX*4), DX
SHRL CX, DX, BP
MOVL BP, -4(DI)(BX*4)
MOVL DX, BP
ADDL $1, BX; JNZ loop1cont
loop1done:
// store final shifted bits
SHRL CX, BP
MOVL BP, -4(DI)(BX*4)
RET
X9b: MOVL $0, c+28(FP)
ret0:
MOVL $0, c+28(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL m+24(FP), BP
MOVL a+28(FP), CX // c = a
MOVL z_len+4(FP), BX
LEAL (DI)(BX*4), DI
LEAL (SI)(BX*4), SI
NEGL BX // i = -n
JMP E5
L5: MOVL (SI)(BX*4), AX
MULL BP
ADDL CX, AX
ADCL $0, DX
MOVL AX, (DI)(BX*4)
MOVL DX, CX
ADDL $1, BX // i++
E5: CMPL BX, $0 // i < 0
JL L5
MOVL CX, c+32(FP)
// mulAddVWW sets z = x*m + a and returns the final carry word c.
// a seeds the carry chain; SI holds the running carry between words.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVL m+24(FP), BX
MOVL a+28(FP), SI
MOVL z_len+4(FP), DI
MOVL x_base+12(FP), BP
MOVL z_base+0(FP), CX
// use counter as negative index
LEAL (BP)(DI*4), BP
LEAL (CX)(DI*4), CX
NEGL DI
loop1:
TESTL DI, DI; JZ loop1done
loop1cont:
// unroll 1X in batches of 1
MOVL 0(BP)(DI*4), AX
// multiply
MULL BX // DX:AX = AX * m
ADDL SI, AX // add incoming carry to low word
MOVL DX, SI
ADCL $0, SI // SI = high word + carry-out of the add
MOVL AX, 0(CX)(DI*4)
ADDL $1, DI; JNZ loop1cont
loop1done:
MOVL SI, c+32(FP)
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
MOVL z+0(FP), BP
MOVL x+12(FP), DI
MOVL y+24(FP), SI
MOVL a+40(FP), CX
MOVL z_len+4(FP), BX
LEAL (DI)(BX*4), DI
LEAL (SI)(BX*4), SI
LEAL (BP)(BX*4), BP
NEGL BX // i = -n
JMP E6
L6: MOVL (SI)(BX*4), AX
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVL a+40(FP), BX
MOVL z_len+4(FP), SI
MOVL x_base+12(FP), DI
MOVL y_base+24(FP), BP
MOVL z_base+0(FP), CX
// use counter as negative index
LEAL (DI)(SI*4), DI
LEAL (BP)(SI*4), BP
LEAL (CX)(SI*4), CX
NEGL SI
loop1:
TESTL SI, SI; JZ loop1done
loop1cont:
// unroll 1X in batches of 1
MOVL 0(BP)(SI*4), AX
// multiply
MULL m+36(FP)
ADDL CX, AX
ADCL $0, DX
ADDL (DI)(BX*4), AX
MOVL AX, (BP)(BX*4)
ADCL $0, DX
MOVL DX, CX
ADDL $1, BX // i++
E6: CMPL BX, $0 // i < 0
JL L6
MOVL CX, c+44(FP)
ADDL BX, AX
MOVL DX, BX
ADCL $0, BX
// add
ADDL 0(DI)(SI*4), AX
ADCL $0, BX
MOVL AX, 0(CX)(SI*4)
ADDL $1, SI; JNZ loop1cont
loop1done:
MOVL BX, c+44(FP)
RET

View File

@ -8,4 +8,4 @@ package big
import "internal/cpu"
var support_adx = cpu.X86.HasADX && cpu.X86.HasBMI2
var hasADX = cpu.X86.HasADX && cpu.X86.HasBMI2

View File

@ -1,408 +1,462 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
// This is faster than using rotate instructions.
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB),NOSPLIT,$0
MOVQ z_len+8(FP), DI
MOVQ x+24(FP), R8
MOVQ y+48(FP), R9
MOVQ z+0(FP), R10
MOVQ $0, CX // c = 0
MOVQ $0, SI // i = 0
// s/JL/JMP/ below to disable the unrolled loop
SUBQ $4, DI // n -= 4
JL V1 // if n < 0 goto V1
U1: // n >= 0
// regular loop body unrolled 4x
ADDQ CX, CX // restore CF
MOVQ 0(R8)(SI*8), R11
MOVQ 8(R8)(SI*8), R12
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
ADCQ 0(R9)(SI*8), R11
ADCQ 8(R9)(SI*8), R12
ADCQ 16(R9)(SI*8), R13
ADCQ 24(R9)(SI*8), R14
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
SBBQ CX, CX // save CF
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
JGE U1 // if n >= 0 goto U1
V1: ADDQ $4, DI // n += 4
JLE E1 // if n <= 0 goto E1
L1: // n > 0
ADDQ CX, CX // restore CF
MOVQ 0(R8)(SI*8), R11
ADCQ 0(R9)(SI*8), R11
MOVQ R11, 0(R10)(SI*8)
SBBQ CX, CX // save CF
ADDQ $1, SI // i++
SUBQ $1, DI // n--
JG L1 // if n > 0 goto L1
E1: NEGQ CX
MOVQ CX, c+72(FP) // return c
// addVV sets z = x + y and returns the outgoing carry c (0 or 1).
// The carry is kept between iterations in R10 as 0 or -1 (via SBBQ R10, R10)
// and re-materialized into CF with ADDQ R10, R10.
TEXT ·addVV(SB), NOSPLIT, $0
MOVQ z_len+8(FP), BX
MOVQ x_base+24(FP), SI
MOVQ y_base+48(FP), DI
MOVQ z_base+0(FP), R8
// compute unrolled loop lengths
MOVQ BX, R9
ANDQ $3, R9 // R9 = n % 4: scalar prologue count
SHRQ $2, BX // BX = n / 4: unrolled iteration count
MOVQ $0, R10 // clear saved carry
loop1:
TESTQ R9, R9; JZ loop1done
loop1cont:
// unroll 1X
ADDQ R10, R10 // restore carry
MOVQ 0(SI), R10
ADCQ 0(DI), R10
MOVQ R10, 0(R8)
SBBQ R10, R10 // save carry
LEAQ 8(SI), SI // ADD $8, SI
LEAQ 8(DI), DI // ADD $8, DI
LEAQ 8(R8), R8 // ADD $8, R8
SUBQ $1, R9; JNZ loop1cont
loop1done:
loop4:
TESTQ BX, BX; JZ loop4done
loop4cont:
// unroll 4X
ADDQ R10, R10 // restore carry
MOVQ 0(SI), R9
MOVQ 8(SI), R10
MOVQ 16(SI), R11
MOVQ 24(SI), R12
ADCQ 0(DI), R9
ADCQ 8(DI), R10
ADCQ 16(DI), R11
ADCQ 24(DI), R12
MOVQ R9, 0(R8)
MOVQ R10, 8(R8)
MOVQ R11, 16(R8)
MOVQ R12, 24(R8)
SBBQ R10, R10 // save carry
LEAQ 32(SI), SI // ADD $32, SI
LEAQ 32(DI), DI // ADD $32, DI
LEAQ 32(R8), R8 // ADD $32, R8
SUBQ $1, BX; JNZ loop4cont
loop4done:
NEGQ R10 // convert add carry (-1 -> 1, 0 -> 0)
MOVQ R10, c+72(FP)
RET
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBQ instead of ADCQ and label names)
TEXT ·subVV(SB),NOSPLIT,$0
MOVQ z_len+8(FP), DI
MOVQ x+24(FP), R8
MOVQ y+48(FP), R9
MOVQ z+0(FP), R10
MOVQ $0, CX // c = 0
MOVQ $0, SI // i = 0
// s/JL/JMP/ below to disable the unrolled loop
SUBQ $4, DI // n -= 4
JL V2 // if n < 0 goto V2
U2: // n >= 0
// regular loop body unrolled 4x
ADDQ CX, CX // restore CF
MOVQ 0(R8)(SI*8), R11
MOVQ 8(R8)(SI*8), R12
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
SBBQ 0(R9)(SI*8), R11
SBBQ 8(R9)(SI*8), R12
SBBQ 16(R9)(SI*8), R13
SBBQ 24(R9)(SI*8), R14
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
SBBQ CX, CX // save CF
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
JGE U2 // if n >= 0 goto U2
V2: ADDQ $4, DI // n += 4
JLE E2 // if n <= 0 goto E2
L2: // n > 0
ADDQ CX, CX // restore CF
MOVQ 0(R8)(SI*8), R11
SBBQ 0(R9)(SI*8), R11
MOVQ R11, 0(R10)(SI*8)
SBBQ CX, CX // save CF
ADDQ $1, SI // i++
SUBQ $1, DI // n--
JG L2 // if n > 0 goto L2
E2: NEGQ CX
MOVQ CX, c+72(FP) // return c
// subVV sets z = x - y and returns the outgoing borrow c (0 or 1).
// Same structure as addVV, with SBBQ in place of ADCQ; the borrow is
// kept between iterations in R10 as 0 or -1.
TEXT ·subVV(SB), NOSPLIT, $0
MOVQ z_len+8(FP), BX
MOVQ x_base+24(FP), SI
MOVQ y_base+48(FP), DI
MOVQ z_base+0(FP), R8
// compute unrolled loop lengths
MOVQ BX, R9
ANDQ $3, R9 // R9 = n % 4: scalar prologue count
SHRQ $2, BX // BX = n / 4: unrolled iteration count
MOVQ $0, R10 // clear saved carry
loop1:
TESTQ R9, R9; JZ loop1done
loop1cont:
// unroll 1X
ADDQ R10, R10 // restore carry
MOVQ 0(SI), R10
SBBQ 0(DI), R10
MOVQ R10, 0(R8)
SBBQ R10, R10 // save carry
LEAQ 8(SI), SI // ADD $8, SI
LEAQ 8(DI), DI // ADD $8, DI
LEAQ 8(R8), R8 // ADD $8, R8
SUBQ $1, R9; JNZ loop1cont
loop1done:
loop4:
TESTQ BX, BX; JZ loop4done
loop4cont:
// unroll 4X
ADDQ R10, R10 // restore carry
MOVQ 0(SI), R9
MOVQ 8(SI), R10
MOVQ 16(SI), R11
MOVQ 24(SI), R12
SBBQ 0(DI), R9
SBBQ 8(DI), R10
SBBQ 16(DI), R11
SBBQ 24(DI), R12
MOVQ R9, 0(R8)
MOVQ R10, 8(R8)
MOVQ R11, 16(R8)
MOVQ R12, 24(R8)
SBBQ R10, R10 // save carry
LEAQ 32(SI), SI // ADD $32, SI
LEAQ 32(DI), DI // ADD $32, DI
LEAQ 32(R8), R8 // ADD $32, R8
SUBQ $1, BX; JNZ loop4cont
loop4done:
NEGQ R10 // convert sub carry (-1 -> 1, 0 -> 0)
MOVQ R10, c+72(FP)
RET
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB),NOSPLIT,$0
MOVQ z_len+8(FP), BX // i = z
SUBQ $1, BX // i--
JL X8b // i < 0 (n <= 0)
// n > 0
MOVQ z+0(FP), R10
MOVQ x+24(FP), R8
TEXT ·lshVU(SB), NOSPLIT, $0
MOVQ z_len+8(FP), BX
TESTQ BX, BX; JZ ret0
MOVQ s+48(FP), CX
MOVQ (R8)(BX*8), AX // w1 = x[n-1]
MOVQ $0, DX
SHLQ CX, AX, DX // w1>>ŝ
MOVQ DX, c+56(FP)
CMPQ BX, $0
JLE X8a // i <= 0
// i > 0
L8: MOVQ AX, DX // w = w1
MOVQ -8(R8)(BX*8), AX // w1 = x[i-1]
SHLQ CX, AX, DX // w<<s | w1>>ŝ
MOVQ DX, (R10)(BX*8) // z[i] = w<<s | w1>>ŝ
SUBQ $1, BX // i--
JG L8 // i > 0
// i <= 0
X8a: SHLQ CX, AX // w1<<s
MOVQ AX, (R10) // z[0] = w1<<s
MOVQ x_base+24(FP), SI
MOVQ z_base+0(FP), DI
// run loop backward
LEAQ (SI)(BX*8), SI
LEAQ (DI)(BX*8), DI
// shift first word into carry
MOVQ -8(SI), R8
MOVQ $0, R9
SHLQ CX, R8, R9
MOVQ R9, c+56(FP)
// shift remaining words
SUBQ $1, BX
// compute unrolled loop lengths
MOVQ BX, R9
ANDQ $3, R9
SHRQ $2, BX
loop1:
TESTQ R9, R9; JZ loop1done
loop1cont:
// unroll 1X
MOVQ -16(SI), R10
SHLQ CX, R10, R8
MOVQ R8, -8(DI)
MOVQ R10, R8
LEAQ -8(SI), SI // ADD $-8, SI
LEAQ -8(DI), DI // ADD $-8, DI
SUBQ $1, R9; JNZ loop1cont
loop1done:
loop4:
TESTQ BX, BX; JZ loop4done
loop4cont:
// unroll 4X
MOVQ -16(SI), R9
MOVQ -24(SI), R10
MOVQ -32(SI), R11
MOVQ -40(SI), R12
SHLQ CX, R9, R8
SHLQ CX, R10, R9
SHLQ CX, R11, R10
SHLQ CX, R12, R11
MOVQ R8, -8(DI)
MOVQ R9, -16(DI)
MOVQ R10, -24(DI)
MOVQ R11, -32(DI)
MOVQ R12, R8
LEAQ -32(SI), SI // ADD $-32, SI
LEAQ -32(DI), DI // ADD $-32, DI
SUBQ $1, BX; JNZ loop4cont
loop4done:
// store final shifted bits
SHLQ CX, R8
MOVQ R8, -8(DI)
RET
X8b: MOVQ $0, c+56(FP)
ret0:
MOVQ $0, c+56(FP)
RET
// func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB),NOSPLIT,$0
MOVQ z_len+8(FP), R11
SUBQ $1, R11 // n--
JL X9b // n < 0 (n <= 0)
// n > 0
MOVQ z+0(FP), R10
MOVQ x+24(FP), R8
TEXT ·rshVU(SB), NOSPLIT, $0
MOVQ z_len+8(FP), BX
TESTQ BX, BX; JZ ret0
MOVQ s+48(FP), CX
MOVQ (R8), AX // w1 = x[0]
MOVQ $0, DX
SHRQ CX, AX, DX // w1<<ŝ
MOVQ DX, c+56(FP)
MOVQ $0, BX // i = 0
JMP E9
// i < n-1
L9: MOVQ AX, DX // w = w1
MOVQ 8(R8)(BX*8), AX // w1 = x[i+1]
SHRQ CX, AX, DX // w>>s | w1<<ŝ
MOVQ DX, (R10)(BX*8) // z[i] = w>>s | w1<<ŝ
ADDQ $1, BX // i++
E9: CMPQ BX, R11
JL L9 // i < n-1
// i >= n-1
X9a: SHRQ CX, AX // w1>>s
MOVQ AX, (R10)(R11*8) // z[n-1] = w1>>s
MOVQ x_base+24(FP), SI
MOVQ z_base+0(FP), DI
// shift first word into carry
MOVQ 0(SI), R8
MOVQ $0, R9
SHRQ CX, R8, R9
MOVQ R9, c+56(FP)
// shift remaining words
SUBQ $1, BX
// compute unrolled loop lengths
MOVQ BX, R9
ANDQ $3, R9
SHRQ $2, BX
loop1:
TESTQ R9, R9; JZ loop1done
loop1cont:
// unroll 1X
MOVQ 8(SI), R10
SHRQ CX, R10, R8
MOVQ R8, 0(DI)
MOVQ R10, R8
LEAQ 8(SI), SI // ADD $8, SI
LEAQ 8(DI), DI // ADD $8, DI
SUBQ $1, R9; JNZ loop1cont
loop1done:
loop4:
TESTQ BX, BX; JZ loop4done
loop4cont:
// unroll 4X
MOVQ 8(SI), R9
MOVQ 16(SI), R10
MOVQ 24(SI), R11
MOVQ 32(SI), R12
SHRQ CX, R9, R8
SHRQ CX, R10, R9
SHRQ CX, R11, R10
SHRQ CX, R12, R11
MOVQ R8, 0(DI)
MOVQ R9, 8(DI)
MOVQ R10, 16(DI)
MOVQ R11, 24(DI)
MOVQ R12, R8
LEAQ 32(SI), SI // ADD $32, SI
LEAQ 32(DI), DI // ADD $32, DI
SUBQ $1, BX; JNZ loop4cont
loop4done:
// store final shifted bits
SHRQ CX, R8
MOVQ R8, 0(DI)
RET
X9b: MOVQ $0, c+56(FP)
ret0:
MOVQ $0, c+56(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOVQ z+0(FP), R10
MOVQ x+24(FP), R8
MOVQ m+48(FP), R9
MOVQ a+56(FP), CX // c = a
MOVQ z_len+8(FP), R11
MOVQ $0, BX // i = 0
CMPQ R11, $4
JL E5
U5: // i+4 <= n
// regular loop body unrolled 4x
MOVQ (0*8)(R8)(BX*8), AX
MULQ R9
ADDQ CX, AX
ADCQ $0, DX
MOVQ AX, (0*8)(R10)(BX*8)
MOVQ DX, CX
MOVQ (1*8)(R8)(BX*8), AX
MULQ R9
ADDQ CX, AX
ADCQ $0, DX
MOVQ AX, (1*8)(R10)(BX*8)
MOVQ DX, CX
MOVQ (2*8)(R8)(BX*8), AX
MULQ R9
ADDQ CX, AX
ADCQ $0, DX
MOVQ AX, (2*8)(R10)(BX*8)
MOVQ DX, CX
MOVQ (3*8)(R8)(BX*8), AX
MULQ R9
ADDQ CX, AX
ADCQ $0, DX
MOVQ AX, (3*8)(R10)(BX*8)
MOVQ DX, CX
ADDQ $4, BX // i += 4
LEAQ 4(BX), DX
CMPQ DX, R11
JLE U5
JMP E5
L5: MOVQ (R8)(BX*8), AX
MULQ R9
ADDQ CX, AX
ADCQ $0, DX
MOVQ AX, (R10)(BX*8)
MOVQ DX, CX
ADDQ $1, BX // i++
E5: CMPQ BX, R11 // i < n
JL L5
MOVQ CX, c+64(FP)
// mulAddVWW sets z = x*m + a and returns the final carry word c.
// a seeds the carry chain; SI holds the running carry between words.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVQ m+48(FP), BX
MOVQ a+56(FP), SI
MOVQ z_len+8(FP), DI
MOVQ x_base+24(FP), R8
MOVQ z_base+0(FP), R9
// compute unrolled loop lengths
MOVQ DI, R10
ANDQ $3, R10 // R10 = n % 4: scalar prologue count
SHRQ $2, DI // DI = n / 4: unrolled iteration count
loop1:
TESTQ R10, R10; JZ loop1done
loop1cont:
// unroll 1X in batches of 1
MOVQ 0(R8), AX
// multiply
MULQ BX // DX:AX = AX * m
ADDQ SI, AX // add incoming carry to low word
MOVQ DX, SI
ADCQ $0, SI // SI = high word + carry-out of the add
MOVQ AX, 0(R9)
LEAQ 8(R8), R8 // ADD $8, R8
LEAQ 8(R9), R9 // ADD $8, R9
SUBQ $1, R10; JNZ loop1cont
loop1done:
loop4:
TESTQ DI, DI; JZ loop4done
loop4cont:
// unroll 4X in batches of 1
MOVQ 0(R8), AX
// multiply
MULQ BX
ADDQ SI, AX
MOVQ DX, SI
ADCQ $0, SI
MOVQ AX, 0(R9)
MOVQ 8(R8), AX
// multiply
MULQ BX
ADDQ SI, AX
MOVQ DX, SI
ADCQ $0, SI
MOVQ AX, 8(R9)
MOVQ 16(R8), AX
// multiply
MULQ BX
ADDQ SI, AX
MOVQ DX, SI
ADCQ $0, SI
MOVQ AX, 16(R9)
MOVQ 24(R8), AX
// multiply
MULQ BX
ADDQ SI, AX
MOVQ DX, SI
ADCQ $0, SI
MOVQ AX, 24(R9)
LEAQ 32(R8), R8 // ADD $32, R8
LEAQ 32(R9), R9 // ADD $32, R9
SUBQ $1, DI; JNZ loop4cont
loop4done:
MOVQ SI, c+64(FP)
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
CMPB ·support_adx(SB), $1
JEQ adx
MOVQ z+0(FP), R14
MOVQ x+24(FP), R10
MOVQ y+48(FP), R8
MOVQ m+72(FP), R9
MOVQ z_len+8(FP), R11
MOVQ $0, BX // i = 0
MOVQ a+80(FP), CX // c = 0
MOVQ R11, R12
ANDQ $-2, R12
CMPQ R11, $2
JAE A6
JMP E6
A6:
MOVQ (R8)(BX*8), AX
MULQ R9
ADDQ (R10)(BX*8), AX
ADCQ $0, DX
ADDQ CX, AX
ADCQ $0, DX
MOVQ DX, CX
MOVQ AX, (R14)(BX*8)
MOVQ (8)(R8)(BX*8), AX
MULQ R9
ADDQ (8)(R10)(BX*8), AX
ADCQ $0, DX
ADDQ CX, AX
ADCQ $0, DX
MOVQ DX, CX
MOVQ AX, (8)(R14)(BX*8)
ADDQ $2, BX
CMPQ BX, R12
JL A6
JMP E6
L6: MOVQ (R8)(BX*8), AX
MULQ R9
ADDQ CX, AX
ADCQ $0, DX
ADDQ (R10)(BX*8), AX
MOVQ AX, (R14)(BX*8)
ADCQ $0, DX
MOVQ DX, CX
ADDQ $1, BX // i++
E6: CMPQ BX, R11 // i < n
JL L6
MOVQ CX, c+88(FP)
TEXT ·addMulVVWW(SB), NOSPLIT, $0
CMPB ·hasADX(SB), $0; JNZ altcarry
MOVQ m+72(FP), BX
MOVQ a+80(FP), SI
MOVQ z_len+8(FP), DI
MOVQ x_base+24(FP), R8
MOVQ y_base+48(FP), R9
MOVQ z_base+0(FP), R10
// compute unrolled loop lengths
MOVQ DI, R11
ANDQ $3, R11
SHRQ $2, DI
loop1:
TESTQ R11, R11; JZ loop1done
loop1cont:
// unroll 1X in batches of 1
MOVQ 0(R9), AX
// multiply
MULQ BX
ADDQ SI, AX
MOVQ DX, SI
ADCQ $0, SI
// add
ADDQ 0(R8), AX
ADCQ $0, SI
MOVQ AX, 0(R10)
LEAQ 8(R8), R8 // ADD $8, R8
LEAQ 8(R9), R9 // ADD $8, R9
LEAQ 8(R10), R10 // ADD $8, R10
SUBQ $1, R11; JNZ loop1cont
loop1done:
loop4:
TESTQ DI, DI; JZ loop4done
loop4cont:
// unroll 4X in batches of 1
MOVQ 0(R9), AX
// multiply
MULQ BX
ADDQ SI, AX
MOVQ DX, SI
ADCQ $0, SI
// add
ADDQ 0(R8), AX
ADCQ $0, SI
MOVQ AX, 0(R10)
MOVQ 8(R9), AX
// multiply
MULQ BX
ADDQ SI, AX
MOVQ DX, SI
ADCQ $0, SI
// add
ADDQ 8(R8), AX
ADCQ $0, SI
MOVQ AX, 8(R10)
MOVQ 16(R9), AX
// multiply
MULQ BX
ADDQ SI, AX
MOVQ DX, SI
ADCQ $0, SI
// add
ADDQ 16(R8), AX
ADCQ $0, SI
MOVQ AX, 16(R10)
MOVQ 24(R9), AX
// multiply
MULQ BX
ADDQ SI, AX
MOVQ DX, SI
ADCQ $0, SI
// add
ADDQ 24(R8), AX
ADCQ $0, SI
MOVQ AX, 24(R10)
LEAQ 32(R8), R8 // ADD $32, R8
LEAQ 32(R9), R9 // ADD $32, R9
LEAQ 32(R10), R10 // ADD $32, R10
SUBQ $1, DI; JNZ loop4cont
loop4done:
MOVQ SI, c+88(FP)
RET
adx:
MOVQ z_len+8(FP), R11
MOVQ z+0(FP), R14
MOVQ x+24(FP), R10
MOVQ y+48(FP), R8
altcarry:
MOVQ m+72(FP), DX
MOVQ $0, BX // i = 0
MOVQ a+80(FP), CX // carry
CMPQ R11, $8
JAE adx_loop_header
CMPQ BX, R11
JL adx_short
MOVQ CX, c+88(FP)
MOVQ a+80(FP), BX
MOVQ z_len+8(FP), SI
MOVQ $0, DI
MOVQ x_base+24(FP), R8
MOVQ y_base+48(FP), R9
MOVQ z_base+0(FP), R10
// compute unrolled loop lengths
MOVQ SI, R11
ANDQ $7, R11
SHRQ $3, SI
alt1:
TESTQ R11, R11; JZ alt1done
alt1cont:
// unroll 1X
// multiply and add
TESTQ AX, AX // clear carry
TESTQ AX, AX // clear carry
MULXQ 0(R9), R13, R12
ADCXQ BX, R13
ADOXQ 0(R8), R13
MOVQ R13, 0(R10)
MOVQ R12, BX
ADCXQ DI, BX
ADOXQ DI, BX
LEAQ 8(R8), R8 // ADD $8, R8
LEAQ 8(R9), R9 // ADD $8, R9
LEAQ 8(R10), R10 // ADD $8, R10
SUBQ $1, R11; JNZ alt1cont
alt1done:
alt8:
TESTQ SI, SI; JZ alt8done
alt8cont:
// unroll 8X in batches of 2
// multiply and add
TESTQ AX, AX // clear carry
TESTQ AX, AX // clear carry
MULXQ 0(R9), R13, R11
ADCXQ BX, R13
ADOXQ 0(R8), R13
MULXQ 8(R9), R14, BX
ADCXQ R11, R14
ADOXQ 8(R8), R14
MOVQ R13, 0(R10)
MOVQ R14, 8(R10)
MULXQ 16(R9), R13, R11
ADCXQ BX, R13
ADOXQ 16(R8), R13
MULXQ 24(R9), R14, BX
ADCXQ R11, R14
ADOXQ 24(R8), R14
MOVQ R13, 16(R10)
MOVQ R14, 24(R10)
MULXQ 32(R9), R13, R11
ADCXQ BX, R13
ADOXQ 32(R8), R13
MULXQ 40(R9), R14, BX
ADCXQ R11, R14
ADOXQ 40(R8), R14
MOVQ R13, 32(R10)
MOVQ R14, 40(R10)
MULXQ 48(R9), R13, R11
ADCXQ BX, R13
ADOXQ 48(R8), R13
MULXQ 56(R9), R14, BX
ADCXQ R11, R14
ADOXQ 56(R8), R14
MOVQ R13, 48(R10)
MOVQ R14, 56(R10)
ADCXQ DI, BX
ADOXQ DI, BX
LEAQ 64(R8), R8 // ADD $64, R8
LEAQ 64(R9), R9 // ADD $64, R9
LEAQ 64(R10), R10 // ADD $64, R10
SUBQ $1, SI; JNZ alt8cont
alt8done:
MOVQ BX, c+88(FP)
RET
adx_loop_header:
MOVQ R11, R13
ANDQ $-8, R13
adx_loop:
XORQ R9, R9 // unset flags
MULXQ (R8), SI, DI
ADCXQ CX,SI
ADOXQ (R10), SI
MOVQ SI,(R14)
MULXQ 8(R8), AX, CX
ADCXQ DI, AX
ADOXQ 8(R10), AX
MOVQ AX, 8(R14)
MULXQ 16(R8), SI, DI
ADCXQ CX, SI
ADOXQ 16(R10), SI
MOVQ SI, 16(R14)
MULXQ 24(R8), AX, CX
ADCXQ DI, AX
ADOXQ 24(R10), AX
MOVQ AX, 24(R14)
MULXQ 32(R8), SI, DI
ADCXQ CX, SI
ADOXQ 32(R10), SI
MOVQ SI, 32(R14)
MULXQ 40(R8), AX, CX
ADCXQ DI, AX
ADOXQ 40(R10), AX
MOVQ AX, 40(R14)
MULXQ 48(R8), SI, DI
ADCXQ CX, SI
ADOXQ 48(R10), SI
MOVQ SI, 48(R14)
MULXQ 56(R8), AX, CX
ADCXQ DI, AX
ADOXQ 56(R10), AX
MOVQ AX, 56(R14)
ADCXQ R9, CX
ADOXQ R9, CX
ADDQ $64, R8
ADDQ $64, R10
ADDQ $64, R14
ADDQ $8, BX
CMPQ BX, R13
JL adx_loop
MOVQ z+0(FP), R14
MOVQ x+24(FP), R10
MOVQ y+48(FP), R8
CMPQ BX, R11
JL adx_short
MOVQ CX, c+88(FP)
RET
adx_short:
MULXQ (R8)(BX*8), SI, DI
ADDQ CX, SI
ADCQ $0, DI
ADDQ (R10)(BX*8), SI
MOVQ SI, (R14)(BX*8)
ADCQ $0, DI
MOVQ DI, CX
ADDQ $1, BX // i++
CMPQ BX, R11
JL adx_short
MOVQ CX, c+88(FP)
RET

View File

@ -0,0 +1,14 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !math_big_pure_go
package big
import "testing"
// TestAddMulVVWWNoADX re-runs TestAddMulVVWW with hasADX forced to false
// for the duration of the test, exercising the non-ADX addMulVVWW path.
func TestAddMulVVWWNoADX(t *testing.T) {
setDuringTest(t, &hasADX, false)
TestAddMulVVWW(t)
}

View File

@ -1,197 +1,355 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB),NOSPLIT,$0
ADD.S $0, R0 // clear carry flag
MOVW z+0(FP), R1
MOVW z_len+4(FP), R4
MOVW x+12(FP), R2
MOVW y+24(FP), R3
ADD R4<<2, R1, R4
B E1
L1:
MOVW.P 4(R2), R5
MOVW.P 4(R3), R6
ADC.S R6, R5
MOVW.P R5, 4(R1)
E1:
TEQ R1, R4
BNE L1
MOVW $0, R0
MOVW.CS $1, R0
MOVW R0, c+36(FP)
// addVV sets z = x + y and returns the outgoing carry c (0 or 1).
// The carry lives in the CPU carry flag across iterations: only flag-setting
// (.S) arithmetic is used inside the loops, and the loop-control SUB/TEQ
// leave the carry flag untouched.
TEXT ·addVV(SB), NOSPLIT, $0
MOVW z_len+4(FP), R0
MOVW x_base+12(FP), R1
MOVW y_base+24(FP), R2
MOVW z_base+0(FP), R3
// compute unrolled loop lengths
AND $3, R0, R4 // R4 = n % 4: scalar prologue count
MOVW R0>>2, R0 // R0 = n / 4: unrolled iteration count
ADD.S $0, R0 // clear carry
loop1:
TEQ $0, R4; BEQ loop1done
loop1cont:
// unroll 1X
MOVW.P 4(R1), R5
MOVW.P 4(R2), R6
ADC.S R6, R5
MOVW.P R5, 4(R3)
SUB $1, R4 // no .S: must not clobber the carry flag
TEQ $0, R4; BNE loop1cont
loop1done:
loop4:
TEQ $0, R0; BEQ loop4done
loop4cont:
// unroll 4X
MOVW.P 4(R1), R4
MOVW.P 4(R1), R5
MOVW.P 4(R1), R6
MOVW.P 4(R1), R7
MOVW.P 4(R2), R8
MOVW.P 4(R2), R9
MOVW.P 4(R2), R11
MOVW.P 4(R2), R12
ADC.S R8, R4
ADC.S R9, R5
ADC.S R11, R6
ADC.S R12, R7
MOVW.P R4, 4(R3)
MOVW.P R5, 4(R3)
MOVW.P R6, 4(R3)
MOVW.P R7, 4(R3)
SUB $1, R0 // no .S: must not clobber the carry flag
TEQ $0, R0; BNE loop4cont
loop4done:
SBC R1, R1 // save carry
ADD $1, R1 // convert add carry
MOVW R1, c+36(FP)
RET
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBC instead of ADC and label names)
TEXT ·subVV(SB),NOSPLIT,$0
SUB.S $0, R0 // clear borrow flag
MOVW z+0(FP), R1
MOVW z_len+4(FP), R4
MOVW x+12(FP), R2
MOVW y+24(FP), R3
ADD R4<<2, R1, R4
B E2
L2:
MOVW.P 4(R2), R5
MOVW.P 4(R3), R6
SBC.S R6, R5
MOVW.P R5, 4(R1)
E2:
TEQ R1, R4
BNE L2
MOVW $0, R0
MOVW.CC $1, R0
MOVW R0, c+36(FP)
// subVV sets z = x - y and returns the outgoing borrow c (0 or 1).
// Same structure as addVV, with SBC in place of ADC; the borrow lives in the
// CPU carry flag across iterations, so loop control avoids flag-setting ops.
TEXT ·subVV(SB), NOSPLIT, $0
MOVW z_len+4(FP), R0
MOVW x_base+12(FP), R1
MOVW y_base+24(FP), R2
MOVW z_base+0(FP), R3
// compute unrolled loop lengths
AND $3, R0, R4 // R4 = n % 4: scalar prologue count
MOVW R0>>2, R0 // R0 = n / 4: unrolled iteration count
SUB.S $0, R0 // clear carry
loop1:
TEQ $0, R4; BEQ loop1done
loop1cont:
// unroll 1X
MOVW.P 4(R1), R5
MOVW.P 4(R2), R6
SBC.S R6, R5
MOVW.P R5, 4(R3)
SUB $1, R4 // no .S: must not clobber the carry flag
TEQ $0, R4; BNE loop1cont
loop1done:
loop4:
TEQ $0, R0; BEQ loop4done
loop4cont:
// unroll 4X
MOVW.P 4(R1), R4
MOVW.P 4(R1), R5
MOVW.P 4(R1), R6
MOVW.P 4(R1), R7
MOVW.P 4(R2), R8
MOVW.P 4(R2), R9
MOVW.P 4(R2), R11
MOVW.P 4(R2), R12
SBC.S R8, R4
SBC.S R9, R5
SBC.S R11, R6
SBC.S R12, R7
MOVW.P R4, 4(R3)
MOVW.P R5, 4(R3)
MOVW.P R6, 4(R3)
MOVW.P R7, 4(R3)
SUB $1, R0 // no .S: must not clobber the carry flag
TEQ $0, R0; BNE loop4cont
loop4done:
SBC R1, R1 // save carry
RSB $0, R1, R1 // convert sub carry
MOVW R1, c+36(FP)
RET
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB),NOSPLIT,$0
MOVW z_len+4(FP), R5
TEQ $0, R5
BEQ X7
MOVW z+0(FP), R1
MOVW x+12(FP), R2
ADD R5<<2, R2, R2
ADD R5<<2, R1, R5
MOVW s+24(FP), R3
ADD $4, R1 // stop one word early
MOVW $32, R4
SUB R3, R4
MOVW $0, R7
MOVW.W -4(R2), R6
MOVW R6<<R3, R7
MOVW R6>>R4, R6
MOVW R6, c+28(FP)
B E7
L7:
MOVW.W -4(R2), R6
ORR R6>>R4, R7
MOVW.W R7, -4(R5)
MOVW R6<<R3, R7
E7:
TEQ R1, R5
BNE L7
MOVW R7, -4(R5)
// func lshVU(z, x []Word, s uint) (c Word)
// z = x << s (0 < s < 32); returns the bits shifted out of the top word.
// Walks the vectors from the most-significant word downward ("run loop
// backward"); R4 carries the low bits of the previous (higher) word into
// the next stored word.
TEXT ·lshVU(SB), NOSPLIT, $0
MOVW z_len+4(FP), R0
TEQ $0, R0; BEQ ret0
MOVW s+24(FP), R1
MOVW x_base+12(FP), R2
MOVW z_base+0(FP), R3
// run loop backward
ADD R0<<2, R2, R2
ADD R0<<2, R3, R3
// shift first word into carry
MOVW.W -4(R2), R4
MOVW $32, R5
SUB R1, R5
// R5 = 32-s; x[len-1]>>(32-s) is the returned carry, x[len-1]<<s seeds R4.
MOVW R4>>R5, R6
MOVW R4<<R1, R4
MOVW R6, c+28(FP)
// shift remaining words
SUB $1, R0
// compute unrolled loop lengths
AND $3, R0, R6
MOVW R0>>2, R0
loop1:
TEQ $0, R6; BEQ loop1done
loop1cont:
// unroll 1X
MOVW.W -4(R2), R7
ORR R7>>R5, R4
MOVW.W R4, -4(R3)
MOVW R7<<R1, R4
SUB $1, R6
TEQ $0, R6; BNE loop1cont
loop1done:
loop4:
TEQ $0, R0; BEQ loop4done
loop4cont:
// unroll 4X
MOVW.W -4(R2), R6
MOVW.W -4(R2), R7
MOVW.W -4(R2), R8
MOVW.W -4(R2), R9
ORR R6>>R5, R4
MOVW.W R4, -4(R3)
MOVW R6<<R1, R4
ORR R7>>R5, R4
MOVW.W R4, -4(R3)
MOVW R7<<R1, R4
ORR R8>>R5, R4
MOVW.W R4, -4(R3)
MOVW R8<<R1, R4
ORR R9>>R5, R4
MOVW.W R4, -4(R3)
MOVW R9<<R1, R4
SUB $1, R0
TEQ $0, R0; BNE loop4cont
loop4done:
// store final shifted bits
MOVW.W R4, -4(R3)
RET
// NOTE(review): X7 is the zero-length exit of the replaced hand-written
// lshVU above (its "BEQ X7"); nothing in this routine branches to it — it
// appears fused here only in this diff view. It falls through into ret0,
// storing c = 0 twice, which is harmless.
X7:
MOVW $0, R1
MOVW R1, c+28(FP)
ret0:
MOVW $0, R1
MOVW R1, c+28(FP)
RET
// func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB),NOSPLIT,$0
MOVW z_len+4(FP), R5
TEQ $0, R5
BEQ X6
MOVW z+0(FP), R1
MOVW x+12(FP), R2
ADD R5<<2, R1, R5
MOVW s+24(FP), R3
SUB $4, R5 // stop one word early
MOVW $32, R4
SUB R3, R4
MOVW $0, R7
// first word
MOVW.P 4(R2), R6
MOVW R6>>R3, R7
MOVW R6<<R4, R6
MOVW R6, c+28(FP)
B E6
// word loop
L6:
MOVW.P 4(R2), R6
ORR R6<<R4, R7
MOVW.P R7, 4(R1)
MOVW R6>>R3, R7
E6:
TEQ R1, R5
BNE L6
MOVW R7, 0(R1)
// func rshVU(z, x []Word, s uint) (c Word)
// z = x >> s (0 < s < 32); returns the bits shifted out of the bottom word.
// Walks the vectors from the least-significant word upward; R4 carries the
// high bits of the previous (lower) word into the next stored word.
TEXT ·rshVU(SB), NOSPLIT, $0
MOVW z_len+4(FP), R0
TEQ $0, R0; BEQ ret0
MOVW s+24(FP), R1
MOVW x_base+12(FP), R2
MOVW z_base+0(FP), R3
// shift first word into carry
MOVW.P 4(R2), R4
MOVW $32, R5
SUB R1, R5
// R5 = 32-s; x[0]<<(32-s) is the returned carry, x[0]>>s seeds R4.
MOVW R4<<R5, R6
MOVW R4>>R1, R4
MOVW R6, c+28(FP)
// shift remaining words
SUB $1, R0
// compute unrolled loop lengths
AND $3, R0, R6
MOVW R0>>2, R0
loop1:
TEQ $0, R6; BEQ loop1done
loop1cont:
// unroll 1X
MOVW.P 4(R2), R7
ORR R7<<R5, R4
MOVW.P R4, 4(R3)
MOVW R7>>R1, R4
SUB $1, R6
TEQ $0, R6; BNE loop1cont
loop1done:
loop4:
TEQ $0, R0; BEQ loop4done
loop4cont:
// unroll 4X
MOVW.P 4(R2), R6
MOVW.P 4(R2), R7
MOVW.P 4(R2), R8
MOVW.P 4(R2), R9
ORR R6<<R5, R4
MOVW.P R4, 4(R3)
MOVW R6>>R1, R4
ORR R7<<R5, R4
MOVW.P R4, 4(R3)
MOVW R7>>R1, R4
ORR R8<<R5, R4
MOVW.P R4, 4(R3)
MOVW R8>>R1, R4
ORR R9<<R5, R4
MOVW.P R4, 4(R3)
MOVW R9>>R1, R4
SUB $1, R0
TEQ $0, R0; BNE loop4cont
loop4done:
// store final shifted bits
MOVW.P R4, 4(R3)
RET
// NOTE(review): X6 is the zero-length exit of the replaced hand-written
// rshVU above (its "BEQ X6"); this routine only uses ret0. It is fused
// here by the diff view and falls harmlessly through into ret0.
X6:
MOVW $0, R1
MOVW R1, c+28(FP)
ret0:
MOVW $0, R1
MOVW R1, c+28(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOVW $0, R0
MOVW z+0(FP), R1
MOVW z_len+4(FP), R5
MOVW x+12(FP), R2
MOVW m+24(FP), R3
MOVW a+28(FP), R4
ADD R5<<2, R1, R5
B E8
// word loop
L8:
MOVW.P 4(R2), R6
MULLU R6, R3, (R7, R6)
ADD.S R4, R6
ADC R0, R7
MOVW.P R6, 4(R1)
MOVW R7, R4
E8:
TEQ R1, R5
BNE L8
MOVW R4, c+32(FP)
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
// z = x*m + a; returns the final carry word.
// R1 holds the running carry word between iterations: each step computes
// hi:lo = x[i]*m (MULLU), adds the incoming carry into lo, and folds the
// resulting flag carry into hi, which becomes the next carry.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVW m+24(FP), R0
MOVW a+28(FP), R1
MOVW z_len+4(FP), R2
MOVW x_base+12(FP), R3
MOVW z_base+0(FP), R4
// compute unrolled loop lengths
AND $3, R2, R5
MOVW R2>>2, R2
loop1:
TEQ $0, R5; BEQ loop1done
loop1cont:
// unroll 1X
MOVW.P 4(R3), R6
// multiply
MULLU R0, R6, (R7, R6)
ADD.S R1, R6
ADC $0, R7, R1
MOVW.P R6, 4(R4)
SUB $1, R5
TEQ $0, R5; BNE loop1cont
loop1done:
loop4:
TEQ $0, R2; BEQ loop4done
loop4cont:
// unroll 4X in batches of 2
MOVW.P 4(R3), R5
MOVW.P 4(R3), R6
// multiply
MULLU R0, R5, (R7, R5)
ADD.S R1, R5
MULLU R0, R6, (R8, R6)
ADC.S R7, R6
ADC $0, R8, R1
MOVW.P R5, 4(R4)
MOVW.P R6, 4(R4)
MOVW.P 4(R3), R5
MOVW.P 4(R3), R6
// multiply
MULLU R0, R5, (R7, R5)
ADD.S R1, R5
MULLU R0, R6, (R8, R6)
ADC.S R7, R6
ADC $0, R8, R1
MOVW.P R5, 4(R4)
MOVW.P R6, 4(R4)
SUB $1, R2
TEQ $0, R2; BNE loop4cont
loop4done:
MOVW R1, c+32(FP)
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
MOVW $0, R0
MOVW z+0(FP), R9
MOVW x+12(FP), R1
MOVW z_len+4(FP), R5
MOVW y+24(FP), R2
MOVW m+36(FP), R3
ADD R5<<2, R1, R5
MOVW a+40(FP), R4
B E9
// word loop
L9:
MOVW.P 4(R2), R6
MULLU R6, R3, (R7, R6)
ADD.S R4, R6
ADC R0, R7
MOVW.P 4(R1), R4
ADD.S R4, R6
ADC R0, R7
MOVW.P R6, 4(R9)
MOVW R7, R4
E9:
TEQ R1, R5
BNE L9
MOVW R4, c+44(FP)
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
// z = x + y*m, with a as the initial carry word; returns the final carry.
// R1 is the running carry word. Each step: hi:lo = y[i]*m, lo += carry
// (flag carry folded into hi), then lo += x[i] (flag carry folded again),
// and hi becomes the next carry.
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVW m+36(FP), R0
MOVW a+40(FP), R1
MOVW z_len+4(FP), R2
MOVW x_base+12(FP), R3
MOVW y_base+24(FP), R4
MOVW z_base+0(FP), R5
// compute unrolled loop lengths
AND $3, R2, R6
MOVW R2>>2, R2
loop1:
TEQ $0, R6; BEQ loop1done
loop1cont:
// unroll 1X
MOVW.P 4(R3), R7
MOVW.P 4(R4), R8
// multiply
MULLU R0, R8, (R9, R8)
ADD.S R1, R8
ADC $0, R9, R1
// add
ADD.S R7, R8
ADC $0, R1
MOVW.P R8, 4(R5)
SUB $1, R6
TEQ $0, R6; BNE loop1cont
loop1done:
loop4:
TEQ $0, R2; BEQ loop4done
loop4cont:
// unroll 4X in batches of 2
MOVW.P 4(R3), R6
MOVW.P 4(R3), R7
MOVW.P 4(R4), R8
MOVW.P 4(R4), R9
// multiply
MULLU R0, R8, (R11, R8)
ADD.S R1, R8
MULLU R0, R9, (R12, R9)
ADC.S R11, R9
ADC $0, R12, R1
// add
ADD.S R6, R8
ADC.S R7, R9
ADC $0, R1
MOVW.P R8, 4(R5)
MOVW.P R9, 4(R5)
MOVW.P 4(R3), R6
MOVW.P 4(R3), R7
MOVW.P 4(R4), R8
MOVW.P 4(R4), R9
// multiply
MULLU R0, R8, (R11, R8)
ADD.S R1, R8
MULLU R0, R9, (R12, R9)
ADC.S R11, R9
ADC $0, R12, R1
// add
ADD.S R6, R8
ADC.S R7, R9
ADC $0, R1
MOVW.P R8, 4(R5)
MOVW.P R9, 4(R5)
SUB $1, R2
TEQ $0, R2; BNE loop4cont
loop4done:
MOVW R1, c+44(FP)
RET

View File

@ -1,375 +1,374 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// TODO: Consider re-implementing using Advanced SIMD
// once the assembler supports those instructions.
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB),NOSPLIT,$0
MOVD z_len+8(FP), R0
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD z+0(FP), R10
ADDS $0, R0 // clear carry flag
TBZ $0, R0, two
MOVD.P 8(R8), R11
MOVD.P 8(R9), R15
ADCS R15, R11
MOVD.P R11, 8(R10)
SUB $1, R0
two:
TBZ $1, R0, loop
LDP.P 16(R8), (R11, R12)
LDP.P 16(R9), (R15, R16)
ADCS R15, R11
ADCS R16, R12
STP.P (R11, R12), 16(R10)
SUB $2, R0
loop:
CBZ R0, done // careful not to touch the carry flag
LDP.P 32(R8), (R11, R12)
LDP -16(R8), (R13, R14)
LDP.P 32(R9), (R15, R16)
LDP -16(R9), (R17, R19)
ADCS R15, R11
ADCS R16, R12
ADCS R17, R13
ADCS R19, R14
STP.P (R11, R12), 32(R10)
STP (R13, R14), -16(R10)
SUB $4, R0
B loop
done:
CSET HS, R0 // extract carry flag
MOVD R0, c+72(FP)
// func addVV(z, x, y []Word) (c Word)  [arm64]
// z = x + y; returns the final carry-out (0 or 1).
// 1-word loop for len%4 words, then a 4x-unrolled loop using paired
// loads/stores (LDP/STP). The carry is chained through NZCV.C via ADCS;
// SUB/CBZ/CBNZ loop control does not touch the flags.
TEXT ·addVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0
MOVD x_base+24(FP), R1
MOVD y_base+48(FP), R2
MOVD z_base+0(FP), R3
// compute unrolled loop lengths
AND $3, R0, R4
LSR $2, R0
ADDS ZR, R0 // clear carry
loop1:
CBZ R4, loop1done
loop1cont:
// unroll 1X
MOVD.P 8(R1), R5
MOVD.P 8(R2), R6
ADCS R6, R5
MOVD.P R5, 8(R3)
SUB $1, R4
CBNZ R4, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
LDP.P 32(R1), (R4, R5)
LDP -16(R1), (R6, R7)
LDP.P 32(R2), (R8, R9)
LDP -16(R2), (R10, R11)
ADCS R8, R4
ADCS R9, R5
ADCS R10, R6
ADCS R11, R7
STP.P (R4, R5), 32(R3)
STP (R6, R7), -16(R3)
SUB $1, R0
CBNZ R0, loop4cont
loop4done:
// ADC ZR, ZR, R1 computes 0+0+C, i.e. R1 = 1 on carry-out, 0 otherwise.
ADC ZR, ZR, R1 // save & convert add carry
MOVD R1, c+72(FP)
RET
// func subVV(z, x, y []Word) (c Word)
TEXT ·subVV(SB),NOSPLIT,$0
MOVD z_len+8(FP), R0
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD z+0(FP), R10
CMP R0, R0 // set carry flag
TBZ $0, R0, two
MOVD.P 8(R8), R11
MOVD.P 8(R9), R15
SBCS R15, R11
MOVD.P R11, 8(R10)
SUB $1, R0
two:
TBZ $1, R0, loop
LDP.P 16(R8), (R11, R12)
LDP.P 16(R9), (R15, R16)
SBCS R15, R11
SBCS R16, R12
STP.P (R11, R12), 16(R10)
SUB $2, R0
loop:
CBZ R0, done // careful not to touch the carry flag
LDP.P 32(R8), (R11, R12)
LDP -16(R8), (R13, R14)
LDP.P 32(R9), (R15, R16)
LDP -16(R9), (R17, R19)
SBCS R15, R11
SBCS R16, R12
SBCS R17, R13
SBCS R19, R14
STP.P (R11, R12), 32(R10)
STP (R13, R14), -16(R10)
SUB $4, R0
B loop
done:
CSET LO, R0 // extract carry flag
MOVD R0, c+72(FP)
// func subVV(z, x, y []Word) (c Word)  [arm64]
// z = x - y; returns the final borrow (0 or 1).
// Same shape as addVV but with SBCS; on arm64 a *clear* C flag means
// borrow, so SUBS ZR, R0 leaves C set (no borrow) before the first SBCS.
TEXT ·subVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0
MOVD x_base+24(FP), R1
MOVD y_base+48(FP), R2
MOVD z_base+0(FP), R3
// compute unrolled loop lengths
AND $3, R0, R4
LSR $2, R0
SUBS ZR, R0 // clear carry
loop1:
CBZ R4, loop1done
loop1cont:
// unroll 1X
MOVD.P 8(R1), R5
MOVD.P 8(R2), R6
SBCS R6, R5
MOVD.P R5, 8(R3)
SUB $1, R4
CBNZ R4, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
LDP.P 32(R1), (R4, R5)
LDP -16(R1), (R6, R7)
LDP.P 32(R2), (R8, R9)
LDP -16(R2), (R10, R11)
SBCS R8, R4
SBCS R9, R5
SBCS R10, R6
SBCS R11, R7
STP.P (R4, R5), 32(R3)
STP (R6, R7), -16(R3)
SUB $1, R0
CBNZ R0, loop4cont
loop4done:
// SBC R1,R1 yields -1 on borrow (C clear), 0 otherwise; SUB from ZR
// negates it to c = 1 on borrow, 0 otherwise.
SBC R1, R1 // save carry
SUB R1, ZR, R1 // convert sub carry
MOVD R1, c+72(FP)
RET
// func lshVU(z, x []Word, s uint) (c Word)
// This implementation handles the shift operation from the high word to the low word,
// which may be an error for the case where the low word of x overlaps with the high
// word of z. When calling this function directly, you need to pay attention to this
// situation.
TEXT ·lshVU(SB),NOSPLIT,$0
LDP z+0(FP), (R0, R1) // R0 = z.ptr, R1 = len(z)
MOVD x+24(FP), R2
MOVD s+48(FP), R3
ADD R1<<3, R0 // R0 = &z[n]
ADD R1<<3, R2 // R2 = &x[n]
CBZ R1, len0
MOVD $64, R4
SUB R3, R4
// handling the most significant element x[n-1]
MOVD.W -8(R2), R6
LSR R4, R6, R5 // return value
LSL R3, R6, R8 // x[i] << s
SUB $1, R1
one: TBZ $0, R1, two
MOVD.W -8(R2), R6
LSR R4, R6, R7
ORR R8, R7
LSL R3, R6, R8
SUB $1, R1
MOVD.W R7, -8(R0)
two:
TBZ $1, R1, loop
LDP.W -16(R2), (R6, R7)
LSR R4, R7, R10
ORR R8, R10
LSL R3, R7
LSR R4, R6, R9
ORR R7, R9
LSL R3, R6, R8
SUB $2, R1
STP.W (R9, R10), -16(R0)
loop:
CBZ R1, done
LDP.W -32(R2), (R10, R11)
LDP 16(R2), (R12, R13)
LSR R4, R13, R23
ORR R8, R23 // z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
LSL R3, R13
LSR R4, R12, R22
ORR R13, R22
LSL R3, R12
LSR R4, R11, R21
ORR R12, R21
LSL R3, R11
LSR R4, R10, R20
ORR R11, R20
LSL R3, R10, R8
STP.W (R20, R21), -32(R0)
STP (R22, R23), 16(R0)
SUB $4, R1
B loop
done:
MOVD.W R8, -8(R0) // the first element x[0]
MOVD R5, c+56(FP) // the part moved out from x[n-1]
// func lshVU(z, x []Word, s uint) (c Word)  [arm64]
// z = x << s (0 < s < 64); returns the bits shifted out of the top word.
// Walks the vectors from the most-significant word downward; R4 carries
// the low bits of the previous (higher) word into the next stored word.
TEXT ·lshVU(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0
CBZ R0, ret0
MOVD s+48(FP), R1
MOVD x_base+24(FP), R2
MOVD z_base+0(FP), R3
// run loop backward
ADD R0<<3, R2, R2
ADD R0<<3, R3, R3
// shift first word into carry
MOVD.W -8(R2), R4
MOVD $64, R5
SUB R1, R5
// R5 = 64-s; x[len-1]>>(64-s) is the returned carry, x[len-1]<<s seeds R4.
LSR R5, R4, R6
LSL R1, R4
MOVD R6, c+56(FP)
// shift remaining words
SUB $1, R0
// compute unrolled loop lengths
AND $3, R0, R6
LSR $2, R0
loop1:
CBZ R6, loop1done
loop1cont:
// unroll 1X
MOVD.W -8(R2), R7
LSR R5, R7, R8
ORR R4, R8
LSL R1, R7, R4
MOVD.W R8, -8(R3)
SUB $1, R6
CBNZ R6, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
LDP.W -32(R2), (R9, R8)
LDP 16(R2), (R7, R6)
LSR R5, R6, R10
ORR R4, R10
LSL R1, R6, R4
LSR R5, R7, R6
ORR R4, R6
LSL R1, R7, R4
LSR R5, R8, R7
ORR R4, R7
LSL R1, R8, R4
LSR R5, R9, R8
ORR R4, R8
LSL R1, R9, R4
STP.W (R8, R7), -32(R3)
STP (R6, R10), 16(R3)
SUB $1, R0
CBNZ R0, loop4cont
loop4done:
// store final shifted bits
MOVD.W R4, -8(R3)
RET
// NOTE(review): len0 is the zero-length exit of the replaced hand-written
// lshVU above (its "CBZ R1, len0"); this routine only uses ret0. It is
// fused here by the diff view and falls harmlessly through into ret0.
len0:
MOVD $0, c+56(FP)
ret0:
MOVD ZR, c+56(FP)
RET
// func rshVU(z, x []Word, s uint) (c Word)
// This implementation handles the shift operation from the low word to the high word,
// which may be an error for the case where the high word of x overlaps with the low
// word of z. When calling this function directly, you need to pay attention to this
// situation.
TEXT ·rshVU(SB),NOSPLIT,$0
MOVD z+0(FP), R0
MOVD z_len+8(FP), R1
MOVD x+24(FP), R2
MOVD s+48(FP), R3
MOVD $0, R8
MOVD $64, R4
SUB R3, R4
CBZ R1, len0
MOVD.P 8(R2), R20
LSR R3, R20, R8
LSL R4, R20
MOVD R20, c+56(FP) // deal with the first element
SUB $1, R1
TBZ $0, R1, two
MOVD.P 8(R2), R6
LSL R4, R6, R20
ORR R8, R20
LSR R3, R6, R8
MOVD.P R20, 8(R0)
SUB $1, R1
two:
TBZ $1, R1, loop
LDP.P 16(R2), (R6, R7)
LSL R4, R6, R20
LSR R3, R6
ORR R8, R20
LSL R4, R7, R21
LSR R3, R7, R8
ORR R6, R21
STP.P (R20, R21), 16(R0)
SUB $2, R1
loop:
CBZ R1, done
LDP.P 32(R2), (R10, R11)
LDP -16(R2), (R12, R13)
LSL R4, R10, R20
LSR R3, R10
ORR R8, R20 // z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
LSL R4, R11, R21
LSR R3, R11
ORR R10, R21
LSL R4, R12, R22
LSR R3, R12
ORR R11, R22
LSL R4, R13, R23
LSR R3, R13, R8
ORR R12, R23
STP.P (R20, R21), 32(R0)
STP (R22, R23), -16(R0)
SUB $4, R1
B loop
done:
MOVD R8, (R0) // deal with the last element
// func rshVU(z, x []Word, s uint) (c Word)  [arm64]
// z = x >> s (0 < s < 64); returns the bits shifted out of the bottom word.
// Walks the vectors from the least-significant word upward; R4 carries the
// high bits of the previous (lower) word into the next stored word.
TEXT ·rshVU(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0
CBZ R0, ret0
MOVD s+48(FP), R1
MOVD x_base+24(FP), R2
MOVD z_base+0(FP), R3
// shift first word into carry
MOVD.P 8(R2), R4
MOVD $64, R5
SUB R1, R5
// R5 = 64-s; x[0]<<(64-s) is the returned carry, x[0]>>s seeds R4.
LSL R5, R4, R6
LSR R1, R4
MOVD R6, c+56(FP)
// shift remaining words
SUB $1, R0
// compute unrolled loop lengths
AND $3, R0, R6
LSR $2, R0
loop1:
CBZ R6, loop1done
loop1cont:
// unroll 1X
MOVD.P 8(R2), R7
LSL R5, R7, R8
ORR R4, R8
LSR R1, R7, R4
MOVD.P R8, 8(R3)
SUB $1, R6
CBNZ R6, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
LDP.P 32(R2), (R6, R7)
LDP -16(R2), (R8, R9)
LSL R5, R6, R10
ORR R4, R10
LSR R1, R6, R4
LSL R5, R7, R6
ORR R4, R6
LSR R1, R7, R4
LSL R5, R8, R7
ORR R4, R7
LSR R1, R8, R4
LSL R5, R9, R8
ORR R4, R8
LSR R1, R9, R4
STP.P (R10, R6), 32(R3)
STP (R7, R8), -16(R3)
SUB $1, R0
CBNZ R0, loop4cont
loop4done:
// store final shifted bits
MOVD.P R4, 8(R3)
RET
// NOTE(review): len0 is the zero-length exit of the replaced hand-written
// rshVU above; this routine only uses ret0. It is fused here by the diff
// view and falls harmlessly through into ret0.
len0:
MOVD $0, c+56(FP)
ret0:
MOVD ZR, c+56(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOVD z+0(FP), R1
MOVD z_len+8(FP), R0
MOVD x+24(FP), R2
MOVD m+48(FP), R3
MOVD a+56(FP), R4
// c, z = x * y + r
TBZ $0, R0, two
MOVD.P 8(R2), R5
MUL R3, R5, R7
UMULH R3, R5, R8
ADDS R4, R7
ADC $0, R8, R4 // c, z[i] = x[i] * y + r
MOVD.P R7, 8(R1)
SUB $1, R0
two:
TBZ $1, R0, loop
LDP.P 16(R2), (R5, R6)
MUL R3, R5, R10
UMULH R3, R5, R11
ADDS R4, R10
MUL R3, R6, R12
UMULH R3, R6, R13
ADCS R12, R11
ADC $0, R13, R4
STP.P (R10, R11), 16(R1)
SUB $2, R0
loop:
CBZ R0, done
LDP.P 32(R2), (R5, R6)
LDP -16(R2), (R7, R8)
MUL R3, R5, R10
UMULH R3, R5, R11
ADDS R4, R10
MUL R3, R6, R12
UMULH R3, R6, R13
ADCS R11, R12
MUL R3, R7, R14
UMULH R3, R7, R15
ADCS R13, R14
MUL R3, R8, R16
UMULH R3, R8, R17
ADCS R15, R16
ADC $0, R17, R4
STP.P (R10, R12), 32(R1)
STP (R14, R16), -16(R1)
SUB $4, R0
B loop
done:
MOVD R4, c+64(FP)
// func mulAddVWW(z, x []Word, m, a Word) (c Word)  [arm64]
// z = x*m + a; returns the final carry word.
// R1 holds the running carry word. Each step computes hi (UMULH) and lo
// (MUL) of x[i]*m, adds the incoming carry to lo with ADDS/ADCS, and the
// final ADC folds the flag carry into hi for the next iteration.
// Unrolled 8X (len%8 words in the 1X loop).
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVD m+48(FP), R0
MOVD a+56(FP), R1
MOVD z_len+8(FP), R2
MOVD x_base+24(FP), R3
MOVD z_base+0(FP), R4
// compute unrolled loop lengths
AND $7, R2, R5
LSR $3, R2
loop1:
CBZ R5, loop1done
loop1cont:
// unroll 1X
MOVD.P 8(R3), R6
// multiply
UMULH R0, R6, R7
MUL R0, R6
ADDS R1, R6
ADC ZR, R7, R1
MOVD.P R6, 8(R4)
SUB $1, R5
CBNZ R5, loop1cont
loop1done:
loop8:
CBZ R2, loop8done
loop8cont:
// unroll 8X
LDP.P 64(R3), (R5, R6)
LDP -48(R3), (R7, R8)
LDP -32(R3), (R9, R10)
LDP -16(R3), (R11, R12)
// multiply
UMULH R0, R5, R13
MUL R0, R5
ADDS R1, R5
UMULH R0, R6, R14
MUL R0, R6
ADCS R13, R6
UMULH R0, R7, R13
MUL R0, R7
ADCS R14, R7
UMULH R0, R8, R14
MUL R0, R8
ADCS R13, R8
UMULH R0, R9, R13
MUL R0, R9
ADCS R14, R9
UMULH R0, R10, R14
MUL R0, R10
ADCS R13, R10
UMULH R0, R11, R13
MUL R0, R11
ADCS R14, R11
UMULH R0, R12, R14
MUL R0, R12
ADCS R13, R12
ADC ZR, R14, R1
STP.P (R5, R6), 64(R4)
STP (R7, R8), -48(R4)
STP (R9, R10), -32(R4)
STP (R11, R12), -16(R4)
SUB $1, R2
CBNZ R2, loop8cont
loop8done:
MOVD R1, c+64(FP)
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
MOVD z+0(FP), R22
MOVD x+24(FP), R1
MOVD z_len+8(FP), R0
MOVD y+48(FP), R2
MOVD m+72(FP), R3
MOVD a+80(FP), R4
TBZ $0, R0, two
MOVD.P 8(R2), R5
MOVD.P 8(R1), R6
MUL R5, R3, R7
UMULH R5, R3, R8
ADDS R4, R7
ADC $0, R8
ADDS R7, R6
ADC $0, R8, R4
MOVD.P R6, 8(R22)
SUB $1, R0
two:
TBZ $1, R0, loop
LDP.P 16(R2), (R5, R10)
LDP.P 16(R1), (R6, R11)
MUL R10, R3, R13
UMULH R10, R3, R12
MUL R5, R3, R7
UMULH R5, R3, R8
ADDS R4, R6
ADCS R13, R11
ADC $0, R12
ADDS R7, R6
ADCS R8, R11
ADC $0, R12, R4
STP.P (R6, R11), 16(R22)
SUB $2, R0
// The main loop of this code operates on a block of 4 words every iteration
// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
loop:
CBZ R0, done
LDP.P 16(R2), (R5, R6)
LDP.P 16(R2), (R7, R8)
LDP.P 16(R1), (R9, R10)
ADDS R4, R9
MUL R6, R3, R14
ADCS R14, R10
MUL R7, R3, R15
LDP.P 16(R1), (R11, R12)
ADCS R15, R11
MUL R8, R3, R16
ADCS R16, R12
UMULH R8, R3, R20
ADC $0, R20
MUL R5, R3, R13
ADDS R13, R9
UMULH R5, R3, R17
ADCS R17, R10
UMULH R6, R3, R21
STP.P (R9, R10), 16(R22)
ADCS R21, R11
UMULH R7, R3, R19
ADCS R19, R12
STP.P (R11, R12), 16(R22)
ADC $0, R20, R4
SUB $4, R0
B loop
done:
MOVD R4, c+88(FP)
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)  [arm64]
// z = x + y*m, with a as the initial carry word; returns the final carry.
// R1 is the running carry word. The 8X loop first does the full multiply
// chain (UMULH/MUL with ADCS-linked carries), then a separate ADDS/ADCS
// chain to add the x words, folding each chain's final carry into R1.
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVD m+72(FP), R0
MOVD a+80(FP), R1
MOVD z_len+8(FP), R2
MOVD x_base+24(FP), R3
MOVD y_base+48(FP), R4
MOVD z_base+0(FP), R5
// compute unrolled loop lengths
AND $7, R2, R6
LSR $3, R2
loop1:
CBZ R6, loop1done
loop1cont:
// unroll 1X
MOVD.P 8(R3), R7
MOVD.P 8(R4), R8
// multiply
UMULH R0, R8, R9
MUL R0, R8
ADDS R1, R8
ADC ZR, R9, R1
// add
ADDS R7, R8
ADC ZR, R1
MOVD.P R8, 8(R5)
SUB $1, R6
CBNZ R6, loop1cont
loop1done:
loop8:
CBZ R2, loop8done
loop8cont:
// unroll 8X
LDP.P 64(R3), (R6, R7)
LDP -48(R3), (R8, R9)
LDP -32(R3), (R10, R11)
LDP -16(R3), (R12, R13)
LDP.P 64(R4), (R14, R15)
LDP -48(R4), (R16, R17)
LDP -32(R4), (R19, R20)
LDP -16(R4), (R21, R22)
// multiply
UMULH R0, R14, R23
MUL R0, R14
ADDS R1, R14
UMULH R0, R15, R24
MUL R0, R15
ADCS R23, R15
UMULH R0, R16, R23
MUL R0, R16
ADCS R24, R16
UMULH R0, R17, R24
MUL R0, R17
ADCS R23, R17
UMULH R0, R19, R23
MUL R0, R19
ADCS R24, R19
UMULH R0, R20, R24
MUL R0, R20
ADCS R23, R20
UMULH R0, R21, R23
MUL R0, R21
ADCS R24, R21
UMULH R0, R22, R24
MUL R0, R22
ADCS R23, R22
ADC ZR, R24, R1
// add
ADDS R6, R14
ADCS R7, R15
ADCS R8, R16
ADCS R9, R17
ADCS R10, R19
ADCS R11, R20
ADCS R12, R21
ADCS R13, R22
ADC ZR, R1
STP.P (R14, R15), 64(R5)
STP (R16, R17), -48(R5)
STP (R19, R20), -32(R5)
STP (R21, R22), -16(R5)
SUB $1, R2
CBNZ R2, loop8cont
loop8done:
MOVD R1, c+88(FP)
RET

View File

@ -4,6 +4,8 @@
//go:build !math_big_pure_go
//go:generate go test ./internal/asmgen -generate
package big
import _ "unsafe" // for linkname

View File

@ -1,82 +1,457 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !math_big_pure_go && loong64
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// Old loong64 stub: tail-jumps to the pure-Go fallback addVV_g.
// Replaced by the generated routine below (shown fused by the diff view).
TEXT ·addVV(SB),NOSPLIT,$0
JMP ·addVV_g(SB)
// func addVV(z, x, y []Word) (c Word)
// func addVV(z, x, y []Word) (c Word)  [loong64]
// z = x + y; returns the final carry (0 or 1).
// loong64 has no flags register, so the carry is synthesized in R28 with
// SGTU compares (R30 is scratch), emulating the ADCS sequence noted in the
// generated per-instruction comments.
TEXT ·addVV(SB), NOSPLIT, $0
MOVV z_len+8(FP), R4
MOVV x_base+24(FP), R5
MOVV y_base+48(FP), R6
MOVV z_base+0(FP), R7
// compute unrolled loop lengths
AND $3, R4, R8
SRLV $2, R4
XOR R28, R28 // clear carry
loop1:
BEQ R8, loop1done
loop1cont:
// unroll 1X
MOVV 0(R5), R9
MOVV 0(R6), R10
ADDVU R10, R9 // ADCS R10, R9, R9 (cr=R28)
SGTU R10, R9, R30 // ...
ADDVU R28, R9 // ...
SGTU R28, R9, R28 // ...
ADDVU R30, R28 // ...
MOVV R9, 0(R7)
ADDVU $8, R5
ADDVU $8, R6
ADDVU $8, R7
SUBVU $1, R8
BNE R8, loop1cont
loop1done:
loop4:
BEQ R4, loop4done
loop4cont:
// unroll 4X
MOVV 0(R5), R8
MOVV 8(R5), R9
MOVV 16(R5), R10
MOVV 24(R5), R11
MOVV 0(R6), R12
MOVV 8(R6), R13
MOVV 16(R6), R14
MOVV 24(R6), R15
ADDVU R12, R8 // ADCS R12, R8, R8 (cr=R28)
SGTU R12, R8, R30 // ...
ADDVU R28, R8 // ...
SGTU R28, R8, R28 // ...
ADDVU R30, R28 // ...
ADDVU R13, R9 // ADCS R13, R9, R9 (cr=R28)
SGTU R13, R9, R30 // ...
ADDVU R28, R9 // ...
SGTU R28, R9, R28 // ...
ADDVU R30, R28 // ...
ADDVU R14, R10 // ADCS R14, R10, R10 (cr=R28)
SGTU R14, R10, R30 // ...
ADDVU R28, R10 // ...
SGTU R28, R10, R28 // ...
ADDVU R30, R28 // ...
ADDVU R15, R11 // ADCS R15, R11, R11 (cr=R28)
SGTU R15, R11, R30 // ...
ADDVU R28, R11 // ...
SGTU R28, R11, R28 // ...
ADDVU R30, R28 // ...
MOVV R8, 0(R7)
MOVV R9, 8(R7)
MOVV R10, 16(R7)
MOVV R11, 24(R7)
ADDVU $32, R5
ADDVU $32, R6
ADDVU $32, R7
SUBVU $1, R4
BNE R4, loop4cont
loop4done:
MOVV R28, c+72(FP)
RET
// func subVV(z, x, y []Word) (c Word)
TEXT ·subVV(SB),NOSPLIT,$0
// input:
// R4: z
// R5: z_len
// R7: x
// R10: y
MOVV z+0(FP), R4
MOVV z_len+8(FP), R5
MOVV x+24(FP), R7
MOVV y+48(FP), R10
MOVV $0, R6
SLLV $3, R5
MOVV $0, R8
loop:
BEQ R5, R6, done
MOVV (R6)(R7), R9
MOVV (R6)(R10), R11
SUBV R11, R9, R11 // x1 - y1 = z1', if z1' > x1 then overflow
SUBV R8, R11, R12 // z1' - c0 = z1, if z1 > z1' then overflow
SGTU R11, R9, R9
SGTU R12, R11, R11
MOVV R12, (R6)(R4)
OR R9, R11, R8
ADDV $8, R6
JMP loop
done:
MOVV R8, c+72(FP)
// func subVV(z, x, y []Word) (c Word)  [loong64]
// z = x - y; returns the final borrow (0 or 1).
// Borrow is synthesized in R28 via SGTU (R30 scratch), emulating the SBCS
// sequence noted in the generated per-instruction comments.
TEXT ·subVV(SB), NOSPLIT, $0
MOVV z_len+8(FP), R4
MOVV x_base+24(FP), R5
MOVV y_base+48(FP), R6
MOVV z_base+0(FP), R7
// compute unrolled loop lengths
AND $3, R4, R8
SRLV $2, R4
XOR R28, R28 // clear carry
loop1:
BEQ R8, loop1done
loop1cont:
// unroll 1X
MOVV 0(R5), R9
MOVV 0(R6), R10
SGTU R28, R9, R30 // SBCS R10, R9, R9
SUBVU R28, R9 // ...
SGTU R10, R9, R28 // ...
SUBVU R10, R9 // ...
ADDVU R30, R28 // ...
MOVV R9, 0(R7)
ADDVU $8, R5
ADDVU $8, R6
ADDVU $8, R7
SUBVU $1, R8
BNE R8, loop1cont
loop1done:
loop4:
BEQ R4, loop4done
loop4cont:
// unroll 4X
MOVV 0(R5), R8
MOVV 8(R5), R9
MOVV 16(R5), R10
MOVV 24(R5), R11
MOVV 0(R6), R12
MOVV 8(R6), R13
MOVV 16(R6), R14
MOVV 24(R6), R15
SGTU R28, R8, R30 // SBCS R12, R8, R8
SUBVU R28, R8 // ...
SGTU R12, R8, R28 // ...
SUBVU R12, R8 // ...
ADDVU R30, R28 // ...
SGTU R28, R9, R30 // SBCS R13, R9, R9
SUBVU R28, R9 // ...
SGTU R13, R9, R28 // ...
SUBVU R13, R9 // ...
ADDVU R30, R28 // ...
SGTU R28, R10, R30 // SBCS R14, R10, R10
SUBVU R28, R10 // ...
SGTU R14, R10, R28 // ...
SUBVU R14, R10 // ...
ADDVU R30, R28 // ...
SGTU R28, R11, R30 // SBCS R15, R11, R11
SUBVU R28, R11 // ...
SGTU R15, R11, R28 // ...
SUBVU R15, R11 // ...
ADDVU R30, R28 // ...
MOVV R8, 0(R7)
MOVV R9, 8(R7)
MOVV R10, 16(R7)
MOVV R11, 24(R7)
ADDVU $32, R5
ADDVU $32, R6
ADDVU $32, R7
SUBVU $1, R4
BNE R4, loop4cont
loop4done:
MOVV R28, c+72(FP)
RET
// Old loong64 stub: tail-jumps to the pure-Go fallback lshVU_g.
// Replaced by the generated routine below (shown fused by the diff view).
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)
// func lshVU(z, x []Word, s uint) (c Word)
// func lshVU(z, x []Word, s uint) (c Word)  [loong64]
// z = x << s (0 < s < 64); returns the bits shifted out of the top word.
// Walks the vectors from the most-significant word downward; R8 carries
// the low bits of the previous (higher) word into the next stored word.
TEXT ·lshVU(SB), NOSPLIT, $0
MOVV z_len+8(FP), R4
BEQ R4, ret0
MOVV s+48(FP), R5
MOVV x_base+24(FP), R6
MOVV z_base+0(FP), R7
// run loop backward
SLLV $3, R4, R8
ADDVU R8, R6
SLLV $3, R4, R8
ADDVU R8, R7
// shift first word into carry
MOVV -8(R6), R8
MOVV $64, R9
SUBVU R5, R9
SRLV R9, R8, R10
SLLV R5, R8
MOVV R10, c+56(FP)
// shift remaining words
SUBVU $1, R4
// compute unrolled loop lengths
AND $3, R4, R10
SRLV $2, R4
loop1:
BEQ R10, loop1done
loop1cont:
// unroll 1X
MOVV -16(R6), R11
SRLV R9, R11, R12
OR R8, R12
SLLV R5, R11, R8
MOVV R12, -8(R7)
ADDVU $-8, R6
ADDVU $-8, R7
SUBVU $1, R10
BNE R10, loop1cont
loop1done:
loop4:
BEQ R4, loop4done
loop4cont:
// unroll 4X
MOVV -16(R6), R10
MOVV -24(R6), R11
MOVV -32(R6), R12
MOVV -40(R6), R13
SRLV R9, R10, R14
OR R8, R14
SLLV R5, R10, R8
SRLV R9, R11, R10
OR R8, R10
SLLV R5, R11, R8
SRLV R9, R12, R11
OR R8, R11
SLLV R5, R12, R8
SRLV R9, R13, R12
OR R8, R12
SLLV R5, R13, R8
MOVV R14, -8(R7)
MOVV R10, -16(R7)
MOVV R11, -24(R7)
MOVV R12, -32(R7)
ADDVU $-32, R6
ADDVU $-32, R7
SUBVU $1, R4
BNE R4, loop4cont
loop4done:
// store final shifted bits
MOVV R8, -8(R7)
RET
ret0:
MOVV R0, c+56(FP)
RET
// Old loong64 stub: tail-jumps to the pure-Go fallback rshVU_g.
// Replaced by the generated routine below (shown fused by the diff view).
TEXT ·rshVU(SB),NOSPLIT,$0
JMP ·rshVU_g(SB)
// func rshVU(z, x []Word, s uint) (c Word)
// func rshVU(z, x []Word, s uint) (c Word)  [loong64]
// z = x >> s (0 < s < 64); returns the bits shifted out of the bottom word.
// Walks the vectors from the least-significant word upward; R8 carries the
// high bits of the previous (lower) word into the next stored word.
TEXT ·rshVU(SB), NOSPLIT, $0
MOVV z_len+8(FP), R4
BEQ R4, ret0
MOVV s+48(FP), R5
MOVV x_base+24(FP), R6
MOVV z_base+0(FP), R7
// shift first word into carry
MOVV 0(R6), R8
MOVV $64, R9
SUBVU R5, R9
SLLV R9, R8, R10
SRLV R5, R8
MOVV R10, c+56(FP)
// shift remaining words
SUBVU $1, R4
// compute unrolled loop lengths
AND $3, R4, R10
SRLV $2, R4
loop1:
BEQ R10, loop1done
loop1cont:
// unroll 1X
MOVV 8(R6), R11
SLLV R9, R11, R12
OR R8, R12
SRLV R5, R11, R8
MOVV R12, 0(R7)
ADDVU $8, R6
ADDVU $8, R7
SUBVU $1, R10
BNE R10, loop1cont
loop1done:
loop4:
BEQ R4, loop4done
loop4cont:
// unroll 4X
MOVV 8(R6), R10
MOVV 16(R6), R11
MOVV 24(R6), R12
MOVV 32(R6), R13
SLLV R9, R10, R14
OR R8, R14
SRLV R5, R10, R8
SLLV R9, R11, R10
OR R8, R10
SRLV R5, R11, R8
SLLV R9, R12, R11
OR R8, R11
SRLV R5, R12, R8
SLLV R9, R13, R12
OR R8, R12
SRLV R5, R13, R8
MOVV R14, 0(R7)
MOVV R10, 8(R7)
MOVV R11, 16(R7)
MOVV R12, 24(R7)
ADDVU $32, R6
ADDVU $32, R7
SUBVU $1, R4
BNE R4, loop4cont
loop4done:
// store final shifted bits
MOVV R8, 0(R7)
RET
ret0:
MOVV R0, c+56(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
// input:
// R4: z
// R5: z_len
// R7: x
// R10: m
// R11: a
MOVV z+0(FP), R4
MOVV z_len+8(FP), R5
MOVV x+24(FP), R7
MOVV m+48(FP), R10
MOVV a+56(FP), R11
SLLV $3, R5
MOVV $0, R6
loop:
BEQ R5, R6, done
MOVV (R6)(R7), R8
MULV R8, R10, R9
MULHVU R8, R10, R12
ADDV R9, R11, R8
SGTU R9, R8, R11 // if (c' = lo + c) < lo then overflow
MOVV R8, (R6)(R4)
ADDV R12, R11
ADDV $8, R6
JMP loop
done:
MOVV R11, c+64(FP)
// func mulAddVWW(z, x []Word, m, a Word) (c Word)  [loong64]
// z = x*m + a; returns the final carry word.
// R5 holds the running carry word; the word-level carry of each low-half
// add is synthesized in R28 with SGTU ("synthetic carry, one column at a
// time"), then folded into the MULHVU high half for the next iteration.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVV m+48(FP), R4
MOVV a+56(FP), R5
MOVV z_len+8(FP), R6
MOVV x_base+24(FP), R7
MOVV z_base+0(FP), R8
// compute unrolled loop lengths
AND $3, R6, R9
SRLV $2, R6
loop1:
BEQ R9, loop1done
loop1cont:
// unroll 1X
MOVV 0(R7), R10
// synthetic carry, one column at a time
MULV R4, R10, R11
MULHVU R4, R10, R12
ADDVU R5, R11, R10 // ADDS R5, R11, R10 (cr=R28)
SGTU R5, R10, R28 // ...
ADDVU R28, R12, R5 // ADC $0, R12, R5
MOVV R10, 0(R8)
ADDVU $8, R7
ADDVU $8, R8
SUBVU $1, R9
BNE R9, loop1cont
loop1done:
loop4:
BEQ R6, loop4done
loop4cont:
// unroll 4X
MOVV 0(R7), R9
MOVV 8(R7), R10
MOVV 16(R7), R11
MOVV 24(R7), R12
// synthetic carry, one column at a time
MULV R4, R9, R13
MULHVU R4, R9, R14
ADDVU R5, R13, R9 // ADDS R5, R13, R9 (cr=R28)
SGTU R5, R9, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
MULV R4, R10, R13
MULHVU R4, R10, R14
ADDVU R5, R13, R10 // ADDS R5, R13, R10 (cr=R28)
SGTU R5, R10, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
MULV R4, R11, R13
MULHVU R4, R11, R14
ADDVU R5, R13, R11 // ADDS R5, R13, R11 (cr=R28)
SGTU R5, R11, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
MULV R4, R12, R13
MULHVU R4, R12, R14
ADDVU R5, R13, R12 // ADDS R5, R13, R12 (cr=R28)
SGTU R5, R12, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
MOVV R9, 0(R8)
MOVV R10, 8(R8)
MOVV R11, 16(R8)
MOVV R12, 24(R8)
ADDVU $32, R7
ADDVU $32, R8
SUBVU $1, R6
BNE R6, loop4cont
loop4done:
MOVV R5, c+64(FP)
RET
// Old loong64 stub: tail-jumps to the pure-Go fallback addMulVVWW_g.
// Replaced by the generated routine below (shown fused by the diff view).
TEXT ·addMulVVWW(SB),NOSPLIT,$0
JMP ·addMulVVWW_g(SB)
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)  [loong64]
// z = x + y*m, with a as the initial carry word; returns the final carry.
// R5 is the running carry word; per-column carries are synthesized in R28
// with SGTU: first x[i] is added into the low product half, then the
// carry word, each time folding the overflow into the MULHVU high half.
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVV m+72(FP), R4
MOVV a+80(FP), R5
MOVV z_len+8(FP), R6
MOVV x_base+24(FP), R7
MOVV y_base+48(FP), R8
MOVV z_base+0(FP), R9
// compute unrolled loop lengths
AND $3, R6, R10
SRLV $2, R6
loop1:
BEQ R10, loop1done
loop1cont:
// unroll 1X
MOVV 0(R7), R11
MOVV 0(R8), R12
// synthetic carry, one column at a time
MULV R4, R12, R13
MULHVU R4, R12, R14
ADDVU R11, R13 // ADDS R11, R13, R13 (cr=R28)
SGTU R11, R13, R28 // ...
ADDVU R28, R14 // ADC $0, R14, R14
ADDVU R5, R13, R12 // ADDS R5, R13, R12 (cr=R28)
SGTU R5, R12, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
MOVV R12, 0(R9)
ADDVU $8, R7
ADDVU $8, R8
ADDVU $8, R9
SUBVU $1, R10
BNE R10, loop1cont
loop1done:
loop4:
BEQ R6, loop4done
loop4cont:
// unroll 4X
MOVV 0(R7), R10
MOVV 8(R7), R11
MOVV 16(R7), R12
MOVV 24(R7), R13
MOVV 0(R8), R14
MOVV 8(R8), R15
MOVV 16(R8), R16
MOVV 24(R8), R17
// synthetic carry, one column at a time
MULV R4, R14, R18
MULHVU R4, R14, R19
ADDVU R10, R18 // ADDS R10, R18, R18 (cr=R28)
SGTU R10, R18, R28 // ...
ADDVU R28, R19 // ADC $0, R19, R19
ADDVU R5, R18, R14 // ADDS R5, R18, R14 (cr=R28)
SGTU R5, R14, R28 // ...
ADDVU R28, R19, R5 // ADC $0, R19, R5
MULV R4, R15, R18
MULHVU R4, R15, R19
ADDVU R11, R18 // ADDS R11, R18, R18 (cr=R28)
SGTU R11, R18, R28 // ...
ADDVU R28, R19 // ADC $0, R19, R19
ADDVU R5, R18, R15 // ADDS R5, R18, R15 (cr=R28)
SGTU R5, R15, R28 // ...
ADDVU R28, R19, R5 // ADC $0, R19, R5
MULV R4, R16, R18
MULHVU R4, R16, R19
ADDVU R12, R18 // ADDS R12, R18, R18 (cr=R28)
SGTU R12, R18, R28 // ...
ADDVU R28, R19 // ADC $0, R19, R19
ADDVU R5, R18, R16 // ADDS R5, R18, R16 (cr=R28)
SGTU R5, R16, R28 // ...
ADDVU R28, R19, R5 // ADC $0, R19, R5
MULV R4, R17, R18
MULHVU R4, R17, R19
ADDVU R13, R18 // ADDS R13, R18, R18 (cr=R28)
SGTU R13, R18, R28 // ...
ADDVU R28, R19 // ADC $0, R19, R19
ADDVU R5, R18, R17 // ADDS R5, R18, R17 (cr=R28)
SGTU R5, R17, R28 // ...
ADDVU R28, R19, R5 // ADC $0, R19, R5
MOVV R14, 0(R9)
MOVV R15, 8(R9)
MOVV R16, 16(R9)
MOVV R17, 24(R9)
ADDVU $32, R7
ADDVU $32, R8
ADDVU $32, R9
SUBVU $1, R6
BNE R6, loop4cont
loop4done:
MOVV R5, c+88(FP)
RET

View File

@ -1,29 +1,467 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go && (mips64 || mips64le)
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// func addVV(z, x, y []Word) (c Word)
// func addVV(z, x, y []Word) (c Word)  [mips64/mips64le]
// z = x + y; returns the final carry (0 or 1).
// No flags register: the carry is synthesized in R26 with SGTU compares
// (R23 scratch), emulating the ADCS sequence noted in the comments.
TEXT ·addVV(SB), NOSPLIT, $0
MOVV z_len+8(FP), R1
MOVV x_base+24(FP), R2
MOVV y_base+48(FP), R3
MOVV z_base+0(FP), R4
// compute unrolled loop lengths
AND $3, R1, R5
SRLV $2, R1
XOR R26, R26 // clear carry
loop1:
BEQ R5, loop1done
loop1cont:
// unroll 1X
MOVV 0(R2), R6
MOVV 0(R3), R7
ADDVU R7, R6 // ADCS R7, R6, R6 (cr=R26)
SGTU R7, R6, R23 // ...
ADDVU R26, R6 // ...
SGTU R26, R6, R26 // ...
ADDVU R23, R26 // ...
MOVV R6, 0(R4)
ADDVU $8, R2
ADDVU $8, R3
ADDVU $8, R4
SUBVU $1, R5
BNE R5, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVV 0(R2), R5
MOVV 8(R2), R6
MOVV 16(R2), R7
MOVV 24(R2), R8
MOVV 0(R3), R9
MOVV 8(R3), R10
MOVV 16(R3), R11
MOVV 24(R3), R12
ADDVU R9, R5 // ADCS R9, R5, R5 (cr=R26)
SGTU R9, R5, R23 // ...
ADDVU R26, R5 // ...
SGTU R26, R5, R26 // ...
ADDVU R23, R26 // ...
ADDVU R10, R6 // ADCS R10, R6, R6 (cr=R26)
SGTU R10, R6, R23 // ...
ADDVU R26, R6 // ...
SGTU R26, R6, R26 // ...
ADDVU R23, R26 // ...
ADDVU R11, R7 // ADCS R11, R7, R7 (cr=R26)
SGTU R11, R7, R23 // ...
ADDVU R26, R7 // ...
SGTU R26, R7, R26 // ...
ADDVU R23, R26 // ...
ADDVU R12, R8 // ADCS R12, R8, R8 (cr=R26)
SGTU R12, R8, R23 // ...
ADDVU R26, R8 // ...
SGTU R26, R8, R26 // ...
ADDVU R23, R26 // ...
MOVV R5, 0(R4)
MOVV R6, 8(R4)
MOVV R7, 16(R4)
MOVV R8, 24(R4)
ADDVU $32, R2
ADDVU $32, R3
ADDVU $32, R4
SUBVU $1, R1
BNE R1, loop4cont
loop4done:
MOVV R26, c+72(FP)
RET
// Old mips64 stub: tail-jumps to the pure-Go fallback addVV_g.
// Replaced by the generated routine above (shown fused by the diff view).
TEXT ·addVV(SB),NOSPLIT,$0
JMP ·addVV_g(SB)
// func subVV(z, x, y []Word) (c Word)
TEXT ·subVV(SB), NOSPLIT, $0
MOVV z_len+8(FP), R1
MOVV x_base+24(FP), R2
MOVV y_base+48(FP), R3
MOVV z_base+0(FP), R4
// compute unrolled loop lengths
AND $3, R1, R5
SRLV $2, R1
XOR R26, R26 // clear carry
loop1:
BEQ R5, loop1done
loop1cont:
// unroll 1X
MOVV 0(R2), R6
MOVV 0(R3), R7
SGTU R26, R6, R23 // SBCS R7, R6, R6
SUBVU R26, R6 // ...
SGTU R7, R6, R26 // ...
SUBVU R7, R6 // ...
ADDVU R23, R26 // ...
MOVV R6, 0(R4)
ADDVU $8, R2
ADDVU $8, R3
ADDVU $8, R4
SUBVU $1, R5
BNE R5, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVV 0(R2), R5
MOVV 8(R2), R6
MOVV 16(R2), R7
MOVV 24(R2), R8
MOVV 0(R3), R9
MOVV 8(R3), R10
MOVV 16(R3), R11
MOVV 24(R3), R12
SGTU R26, R5, R23 // SBCS R9, R5, R5
SUBVU R26, R5 // ...
SGTU R9, R5, R26 // ...
SUBVU R9, R5 // ...
ADDVU R23, R26 // ...
SGTU R26, R6, R23 // SBCS R10, R6, R6
SUBVU R26, R6 // ...
SGTU R10, R6, R26 // ...
SUBVU R10, R6 // ...
ADDVU R23, R26 // ...
SGTU R26, R7, R23 // SBCS R11, R7, R7
SUBVU R26, R7 // ...
SGTU R11, R7, R26 // ...
SUBVU R11, R7 // ...
ADDVU R23, R26 // ...
SGTU R26, R8, R23 // SBCS R12, R8, R8
SUBVU R26, R8 // ...
SGTU R12, R8, R26 // ...
SUBVU R12, R8 // ...
ADDVU R23, R26 // ...
MOVV R5, 0(R4)
MOVV R6, 8(R4)
MOVV R7, 16(R4)
MOVV R8, 24(R4)
ADDVU $32, R2
ADDVU $32, R3
ADDVU $32, R4
SUBVU $1, R1
BNE R1, loop4cont
loop4done:
MOVV R26, c+72(FP)
RET
TEXT ·subVV(SB),NOSPLIT,$0
JMP ·subVV_g(SB)
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0
MOVV z_len+8(FP), R1
BEQ R1, ret0
MOVV s+48(FP), R2
MOVV x_base+24(FP), R3
MOVV z_base+0(FP), R4
// run loop backward
SLLV $3, R1, R5
ADDVU R5, R3
SLLV $3, R1, R5
ADDVU R5, R4
// shift first word into carry
MOVV -8(R3), R5
MOVV $64, R6
SUBVU R2, R6
SRLV R6, R5, R7
SLLV R2, R5
MOVV R7, c+56(FP)
// shift remaining words
SUBVU $1, R1
// compute unrolled loop lengths
AND $3, R1, R7
SRLV $2, R1
loop1:
BEQ R7, loop1done
loop1cont:
// unroll 1X
MOVV -16(R3), R8
SRLV R6, R8, R9
OR R5, R9
SLLV R2, R8, R5
MOVV R9, -8(R4)
ADDVU $-8, R3
ADDVU $-8, R4
SUBVU $1, R7
BNE R7, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVV -16(R3), R7
MOVV -24(R3), R8
MOVV -32(R3), R9
MOVV -40(R3), R10
SRLV R6, R7, R11
OR R5, R11
SLLV R2, R7, R5
SRLV R6, R8, R7
OR R5, R7
SLLV R2, R8, R5
SRLV R6, R9, R8
OR R5, R8
SLLV R2, R9, R5
SRLV R6, R10, R9
OR R5, R9
SLLV R2, R10, R5
MOVV R11, -8(R4)
MOVV R7, -16(R4)
MOVV R8, -24(R4)
MOVV R9, -32(R4)
ADDVU $-32, R3
ADDVU $-32, R4
SUBVU $1, R1
BNE R1, loop4cont
loop4done:
// store final shifted bits
MOVV R5, -8(R4)
RET
ret0:
MOVV R0, c+56(FP)
RET
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)
// func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB), NOSPLIT, $0
MOVV z_len+8(FP), R1
BEQ R1, ret0
MOVV s+48(FP), R2
MOVV x_base+24(FP), R3
MOVV z_base+0(FP), R4
// shift first word into carry
MOVV 0(R3), R5
MOVV $64, R6
SUBVU R2, R6
SLLV R6, R5, R7
SRLV R2, R5
MOVV R7, c+56(FP)
// shift remaining words
SUBVU $1, R1
// compute unrolled loop lengths
AND $3, R1, R7
SRLV $2, R1
loop1:
BEQ R7, loop1done
loop1cont:
// unroll 1X
MOVV 8(R3), R8
SLLV R6, R8, R9
OR R5, R9
SRLV R2, R8, R5
MOVV R9, 0(R4)
ADDVU $8, R3
ADDVU $8, R4
SUBVU $1, R7
BNE R7, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVV 8(R3), R7
MOVV 16(R3), R8
MOVV 24(R3), R9
MOVV 32(R3), R10
SLLV R6, R7, R11
OR R5, R11
SRLV R2, R7, R5
SLLV R6, R8, R7
OR R5, R7
SRLV R2, R8, R5
SLLV R6, R9, R8
OR R5, R8
SRLV R2, R9, R5
SLLV R6, R10, R9
OR R5, R9
SRLV R2, R10, R5
MOVV R11, 0(R4)
MOVV R7, 8(R4)
MOVV R8, 16(R4)
MOVV R9, 24(R4)
ADDVU $32, R3
ADDVU $32, R4
SUBVU $1, R1
BNE R1, loop4cont
loop4done:
// store final shifted bits
MOVV R5, 0(R4)
RET
ret0:
MOVV R0, c+56(FP)
RET
TEXT ·rshVU(SB),NOSPLIT,$0
JMP ·rshVU_g(SB)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
JMP ·mulAddVWW_g(SB)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
JMP ·addMulVVWW_g(SB)
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVV m+48(FP), R1
MOVV a+56(FP), R2
MOVV z_len+8(FP), R3
MOVV x_base+24(FP), R4
MOVV z_base+0(FP), R5
// compute unrolled loop lengths
AND $3, R3, R6
SRLV $2, R3
loop1:
BEQ R6, loop1done
loop1cont:
// unroll 1X
MOVV 0(R4), R7
// synthetic carry, one column at a time
MULVU R1, R7
MOVV LO, R8
MOVV HI, R9
ADDVU R2, R8, R7 // ADDS R2, R8, R7 (cr=R26)
SGTU R2, R7, R26 // ...
ADDVU R26, R9, R2 // ADC $0, R9, R2
MOVV R7, 0(R5)
ADDVU $8, R4
ADDVU $8, R5
SUBVU $1, R6
BNE R6, loop1cont
loop1done:
loop4:
BEQ R3, loop4done
loop4cont:
// unroll 4X
MOVV 0(R4), R6
MOVV 8(R4), R7
MOVV 16(R4), R8
MOVV 24(R4), R9
// synthetic carry, one column at a time
MULVU R1, R6
MOVV LO, R10
MOVV HI, R11
ADDVU R2, R10, R6 // ADDS R2, R10, R6 (cr=R26)
SGTU R2, R6, R26 // ...
ADDVU R26, R11, R2 // ADC $0, R11, R2
MULVU R1, R7
MOVV LO, R10
MOVV HI, R11
ADDVU R2, R10, R7 // ADDS R2, R10, R7 (cr=R26)
SGTU R2, R7, R26 // ...
ADDVU R26, R11, R2 // ADC $0, R11, R2
MULVU R1, R8
MOVV LO, R10
MOVV HI, R11
ADDVU R2, R10, R8 // ADDS R2, R10, R8 (cr=R26)
SGTU R2, R8, R26 // ...
ADDVU R26, R11, R2 // ADC $0, R11, R2
MULVU R1, R9
MOVV LO, R10
MOVV HI, R11
ADDVU R2, R10, R9 // ADDS R2, R10, R9 (cr=R26)
SGTU R2, R9, R26 // ...
ADDVU R26, R11, R2 // ADC $0, R11, R2
MOVV R6, 0(R5)
MOVV R7, 8(R5)
MOVV R8, 16(R5)
MOVV R9, 24(R5)
ADDVU $32, R4
ADDVU $32, R5
SUBVU $1, R3
BNE R3, loop4cont
loop4done:
MOVV R2, c+64(FP)
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVV m+72(FP), R1
MOVV a+80(FP), R2
MOVV z_len+8(FP), R3
MOVV x_base+24(FP), R4
MOVV y_base+48(FP), R5
MOVV z_base+0(FP), R6
// compute unrolled loop lengths
AND $3, R3, R7
SRLV $2, R3
loop1:
BEQ R7, loop1done
loop1cont:
// unroll 1X
MOVV 0(R4), R8
MOVV 0(R5), R9
// synthetic carry, one column at a time
MULVU R1, R9
MOVV LO, R10
MOVV HI, R11
ADDVU R8, R10 // ADDS R8, R10, R10 (cr=R26)
SGTU R8, R10, R26 // ...
ADDVU R26, R11 // ADC $0, R11, R11
ADDVU R2, R10, R9 // ADDS R2, R10, R9 (cr=R26)
SGTU R2, R9, R26 // ...
ADDVU R26, R11, R2 // ADC $0, R11, R2
MOVV R9, 0(R6)
ADDVU $8, R4
ADDVU $8, R5
ADDVU $8, R6
SUBVU $1, R7
BNE R7, loop1cont
loop1done:
loop4:
BEQ R3, loop4done
loop4cont:
// unroll 4X
MOVV 0(R4), R7
MOVV 8(R4), R8
MOVV 16(R4), R9
MOVV 24(R4), R10
MOVV 0(R5), R11
MOVV 8(R5), R12
MOVV 16(R5), R13
MOVV 24(R5), R14
// synthetic carry, one column at a time
MULVU R1, R11
MOVV LO, R15
MOVV HI, R16
ADDVU R7, R15 // ADDS R7, R15, R15 (cr=R26)
SGTU R7, R15, R26 // ...
ADDVU R26, R16 // ADC $0, R16, R16
ADDVU R2, R15, R11 // ADDS R2, R15, R11 (cr=R26)
SGTU R2, R11, R26 // ...
ADDVU R26, R16, R2 // ADC $0, R16, R2
MULVU R1, R12
MOVV LO, R15
MOVV HI, R16
ADDVU R8, R15 // ADDS R8, R15, R15 (cr=R26)
SGTU R8, R15, R26 // ...
ADDVU R26, R16 // ADC $0, R16, R16
ADDVU R2, R15, R12 // ADDS R2, R15, R12 (cr=R26)
SGTU R2, R12, R26 // ...
ADDVU R26, R16, R2 // ADC $0, R16, R2
MULVU R1, R13
MOVV LO, R15
MOVV HI, R16
ADDVU R9, R15 // ADDS R9, R15, R15 (cr=R26)
SGTU R9, R15, R26 // ...
ADDVU R26, R16 // ADC $0, R16, R16
ADDVU R2, R15, R13 // ADDS R2, R15, R13 (cr=R26)
SGTU R2, R13, R26 // ...
ADDVU R26, R16, R2 // ADC $0, R16, R2
MULVU R1, R14
MOVV LO, R15
MOVV HI, R16
ADDVU R10, R15 // ADDS R10, R15, R15 (cr=R26)
SGTU R10, R15, R26 // ...
ADDVU R26, R16 // ADC $0, R16, R16
ADDVU R2, R15, R14 // ADDS R2, R15, R14 (cr=R26)
SGTU R2, R14, R26 // ...
ADDVU R26, R16, R2 // ADC $0, R16, R2
MOVV R11, 0(R6)
MOVV R12, 8(R6)
MOVV R13, 16(R6)
MOVV R14, 24(R6)
ADDVU $32, R4
ADDVU $32, R5
ADDVU $32, R6
SUBVU $1, R3
BNE R3, loop4cont
loop4done:
MOVV R2, c+88(FP)
RET

View File

@ -1,29 +1,467 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go && (mips || mipsle)
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB), NOSPLIT, $0
MOVW z_len+4(FP), R1
MOVW x_base+12(FP), R2
MOVW y_base+24(FP), R3
MOVW z_base+0(FP), R4
// compute unrolled loop lengths
AND $3, R1, R5
SRL $2, R1
XOR R26, R26 // clear carry
loop1:
BEQ R5, loop1done
loop1cont:
// unroll 1X
MOVW 0(R2), R6
MOVW 0(R3), R7
ADDU R7, R6 // ADCS R7, R6, R6 (cr=R26)
SGTU R7, R6, R23 // ...
ADDU R26, R6 // ...
SGTU R26, R6, R26 // ...
ADDU R23, R26 // ...
MOVW R6, 0(R4)
ADDU $4, R2
ADDU $4, R3
ADDU $4, R4
SUBU $1, R5
BNE R5, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVW 0(R2), R5
MOVW 4(R2), R6
MOVW 8(R2), R7
MOVW 12(R2), R8
MOVW 0(R3), R9
MOVW 4(R3), R10
MOVW 8(R3), R11
MOVW 12(R3), R12
ADDU R9, R5 // ADCS R9, R5, R5 (cr=R26)
SGTU R9, R5, R23 // ...
ADDU R26, R5 // ...
SGTU R26, R5, R26 // ...
ADDU R23, R26 // ...
ADDU R10, R6 // ADCS R10, R6, R6 (cr=R26)
SGTU R10, R6, R23 // ...
ADDU R26, R6 // ...
SGTU R26, R6, R26 // ...
ADDU R23, R26 // ...
ADDU R11, R7 // ADCS R11, R7, R7 (cr=R26)
SGTU R11, R7, R23 // ...
ADDU R26, R7 // ...
SGTU R26, R7, R26 // ...
ADDU R23, R26 // ...
ADDU R12, R8 // ADCS R12, R8, R8 (cr=R26)
SGTU R12, R8, R23 // ...
ADDU R26, R8 // ...
SGTU R26, R8, R26 // ...
ADDU R23, R26 // ...
MOVW R5, 0(R4)
MOVW R6, 4(R4)
MOVW R7, 8(R4)
MOVW R8, 12(R4)
ADDU $16, R2
ADDU $16, R3
ADDU $16, R4
SUBU $1, R1
BNE R1, loop4cont
loop4done:
MOVW R26, c+36(FP)
RET
TEXT ·addVV(SB),NOSPLIT,$0
JMP ·addVV_g(SB)
// func subVV(z, x, y []Word) (c Word)
TEXT ·subVV(SB), NOSPLIT, $0
MOVW z_len+4(FP), R1
MOVW x_base+12(FP), R2
MOVW y_base+24(FP), R3
MOVW z_base+0(FP), R4
// compute unrolled loop lengths
AND $3, R1, R5
SRL $2, R1
XOR R26, R26 // clear carry
loop1:
BEQ R5, loop1done
loop1cont:
// unroll 1X
MOVW 0(R2), R6
MOVW 0(R3), R7
SGTU R26, R6, R23 // SBCS R7, R6, R6
SUBU R26, R6 // ...
SGTU R7, R6, R26 // ...
SUBU R7, R6 // ...
ADDU R23, R26 // ...
MOVW R6, 0(R4)
ADDU $4, R2
ADDU $4, R3
ADDU $4, R4
SUBU $1, R5
BNE R5, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVW 0(R2), R5
MOVW 4(R2), R6
MOVW 8(R2), R7
MOVW 12(R2), R8
MOVW 0(R3), R9
MOVW 4(R3), R10
MOVW 8(R3), R11
MOVW 12(R3), R12
SGTU R26, R5, R23 // SBCS R9, R5, R5
SUBU R26, R5 // ...
SGTU R9, R5, R26 // ...
SUBU R9, R5 // ...
ADDU R23, R26 // ...
SGTU R26, R6, R23 // SBCS R10, R6, R6
SUBU R26, R6 // ...
SGTU R10, R6, R26 // ...
SUBU R10, R6 // ...
ADDU R23, R26 // ...
SGTU R26, R7, R23 // SBCS R11, R7, R7
SUBU R26, R7 // ...
SGTU R11, R7, R26 // ...
SUBU R11, R7 // ...
ADDU R23, R26 // ...
SGTU R26, R8, R23 // SBCS R12, R8, R8
SUBU R26, R8 // ...
SGTU R12, R8, R26 // ...
SUBU R12, R8 // ...
ADDU R23, R26 // ...
MOVW R5, 0(R4)
MOVW R6, 4(R4)
MOVW R7, 8(R4)
MOVW R8, 12(R4)
ADDU $16, R2
ADDU $16, R3
ADDU $16, R4
SUBU $1, R1
BNE R1, loop4cont
loop4done:
MOVW R26, c+36(FP)
RET
TEXT ·subVV(SB),NOSPLIT,$0
JMP ·subVV_g(SB)
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0
MOVW z_len+4(FP), R1
BEQ R1, ret0
MOVW s+24(FP), R2
MOVW x_base+12(FP), R3
MOVW z_base+0(FP), R4
// run loop backward
SLL $2, R1, R5
ADDU R5, R3
SLL $2, R1, R5
ADDU R5, R4
// shift first word into carry
MOVW -4(R3), R5
MOVW $32, R6
SUBU R2, R6
SRL R6, R5, R7
SLL R2, R5
MOVW R7, c+28(FP)
// shift remaining words
SUBU $1, R1
// compute unrolled loop lengths
AND $3, R1, R7
SRL $2, R1
loop1:
BEQ R7, loop1done
loop1cont:
// unroll 1X
MOVW -8(R3), R8
SRL R6, R8, R9
OR R5, R9
SLL R2, R8, R5
MOVW R9, -4(R4)
ADDU $-4, R3
ADDU $-4, R4
SUBU $1, R7
BNE R7, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVW -8(R3), R7
MOVW -12(R3), R8
MOVW -16(R3), R9
MOVW -20(R3), R10
SRL R6, R7, R11
OR R5, R11
SLL R2, R7, R5
SRL R6, R8, R7
OR R5, R7
SLL R2, R8, R5
SRL R6, R9, R8
OR R5, R8
SLL R2, R9, R5
SRL R6, R10, R9
OR R5, R9
SLL R2, R10, R5
MOVW R11, -4(R4)
MOVW R7, -8(R4)
MOVW R8, -12(R4)
MOVW R9, -16(R4)
ADDU $-16, R3
ADDU $-16, R4
SUBU $1, R1
BNE R1, loop4cont
loop4done:
// store final shifted bits
MOVW R5, -4(R4)
RET
ret0:
MOVW R0, c+28(FP)
RET
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)
// func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB), NOSPLIT, $0
MOVW z_len+4(FP), R1
BEQ R1, ret0
MOVW s+24(FP), R2
MOVW x_base+12(FP), R3
MOVW z_base+0(FP), R4
// shift first word into carry
MOVW 0(R3), R5
MOVW $32, R6
SUBU R2, R6
SLL R6, R5, R7
SRL R2, R5
MOVW R7, c+28(FP)
// shift remaining words
SUBU $1, R1
// compute unrolled loop lengths
AND $3, R1, R7
SRL $2, R1
loop1:
BEQ R7, loop1done
loop1cont:
// unroll 1X
MOVW 4(R3), R8
SLL R6, R8, R9
OR R5, R9
SRL R2, R8, R5
MOVW R9, 0(R4)
ADDU $4, R3
ADDU $4, R4
SUBU $1, R7
BNE R7, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVW 4(R3), R7
MOVW 8(R3), R8
MOVW 12(R3), R9
MOVW 16(R3), R10
SLL R6, R7, R11
OR R5, R11
SRL R2, R7, R5
SLL R6, R8, R7
OR R5, R7
SRL R2, R8, R5
SLL R6, R9, R8
OR R5, R8
SRL R2, R9, R5
SLL R6, R10, R9
OR R5, R9
SRL R2, R10, R5
MOVW R11, 0(R4)
MOVW R7, 4(R4)
MOVW R8, 8(R4)
MOVW R9, 12(R4)
ADDU $16, R3
ADDU $16, R4
SUBU $1, R1
BNE R1, loop4cont
loop4done:
// store final shifted bits
MOVW R5, 0(R4)
RET
ret0:
MOVW R0, c+28(FP)
RET
TEXT ·rshVU(SB),NOSPLIT,$0
JMP ·rshVU_g(SB)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
JMP ·mulAddVWW_g(SB)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
JMP ·addMulVVWW_g(SB)
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVW m+24(FP), R1
MOVW a+28(FP), R2
MOVW z_len+4(FP), R3
MOVW x_base+12(FP), R4
MOVW z_base+0(FP), R5
// compute unrolled loop lengths
AND $3, R3, R6
SRL $2, R3
loop1:
BEQ R6, loop1done
loop1cont:
// unroll 1X
MOVW 0(R4), R7
// synthetic carry, one column at a time
MULU R1, R7
MOVW LO, R8
MOVW HI, R9
ADDU R2, R8, R7 // ADDS R2, R8, R7 (cr=R26)
SGTU R2, R7, R26 // ...
ADDU R26, R9, R2 // ADC $0, R9, R2
MOVW R7, 0(R5)
ADDU $4, R4
ADDU $4, R5
SUBU $1, R6
BNE R6, loop1cont
loop1done:
loop4:
BEQ R3, loop4done
loop4cont:
// unroll 4X
MOVW 0(R4), R6
MOVW 4(R4), R7
MOVW 8(R4), R8
MOVW 12(R4), R9
// synthetic carry, one column at a time
MULU R1, R6
MOVW LO, R10
MOVW HI, R11
ADDU R2, R10, R6 // ADDS R2, R10, R6 (cr=R26)
SGTU R2, R6, R26 // ...
ADDU R26, R11, R2 // ADC $0, R11, R2
MULU R1, R7
MOVW LO, R10
MOVW HI, R11
ADDU R2, R10, R7 // ADDS R2, R10, R7 (cr=R26)
SGTU R2, R7, R26 // ...
ADDU R26, R11, R2 // ADC $0, R11, R2
MULU R1, R8
MOVW LO, R10
MOVW HI, R11
ADDU R2, R10, R8 // ADDS R2, R10, R8 (cr=R26)
SGTU R2, R8, R26 // ...
ADDU R26, R11, R2 // ADC $0, R11, R2
MULU R1, R9
MOVW LO, R10
MOVW HI, R11
ADDU R2, R10, R9 // ADDS R2, R10, R9 (cr=R26)
SGTU R2, R9, R26 // ...
ADDU R26, R11, R2 // ADC $0, R11, R2
MOVW R6, 0(R5)
MOVW R7, 4(R5)
MOVW R8, 8(R5)
MOVW R9, 12(R5)
ADDU $16, R4
ADDU $16, R5
SUBU $1, R3
BNE R3, loop4cont
loop4done:
MOVW R2, c+32(FP)
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVW m+36(FP), R1
MOVW a+40(FP), R2
MOVW z_len+4(FP), R3
MOVW x_base+12(FP), R4
MOVW y_base+24(FP), R5
MOVW z_base+0(FP), R6
// compute unrolled loop lengths
AND $3, R3, R7
SRL $2, R3
loop1:
BEQ R7, loop1done
loop1cont:
// unroll 1X
MOVW 0(R4), R8
MOVW 0(R5), R9
// synthetic carry, one column at a time
MULU R1, R9
MOVW LO, R10
MOVW HI, R11
ADDU R8, R10 // ADDS R8, R10, R10 (cr=R26)
SGTU R8, R10, R26 // ...
ADDU R26, R11 // ADC $0, R11, R11
ADDU R2, R10, R9 // ADDS R2, R10, R9 (cr=R26)
SGTU R2, R9, R26 // ...
ADDU R26, R11, R2 // ADC $0, R11, R2
MOVW R9, 0(R6)
ADDU $4, R4
ADDU $4, R5
ADDU $4, R6
SUBU $1, R7
BNE R7, loop1cont
loop1done:
loop4:
BEQ R3, loop4done
loop4cont:
// unroll 4X
MOVW 0(R4), R7
MOVW 4(R4), R8
MOVW 8(R4), R9
MOVW 12(R4), R10
MOVW 0(R5), R11
MOVW 4(R5), R12
MOVW 8(R5), R13
MOVW 12(R5), R14
// synthetic carry, one column at a time
MULU R1, R11
MOVW LO, R15
MOVW HI, R16
ADDU R7, R15 // ADDS R7, R15, R15 (cr=R26)
SGTU R7, R15, R26 // ...
ADDU R26, R16 // ADC $0, R16, R16
ADDU R2, R15, R11 // ADDS R2, R15, R11 (cr=R26)
SGTU R2, R11, R26 // ...
ADDU R26, R16, R2 // ADC $0, R16, R2
MULU R1, R12
MOVW LO, R15
MOVW HI, R16
ADDU R8, R15 // ADDS R8, R15, R15 (cr=R26)
SGTU R8, R15, R26 // ...
ADDU R26, R16 // ADC $0, R16, R16
ADDU R2, R15, R12 // ADDS R2, R15, R12 (cr=R26)
SGTU R2, R12, R26 // ...
ADDU R26, R16, R2 // ADC $0, R16, R2
MULU R1, R13
MOVW LO, R15
MOVW HI, R16
ADDU R9, R15 // ADDS R9, R15, R15 (cr=R26)
SGTU R9, R15, R26 // ...
ADDU R26, R16 // ADC $0, R16, R16
ADDU R2, R15, R13 // ADDS R2, R15, R13 (cr=R26)
SGTU R2, R13, R26 // ...
ADDU R26, R16, R2 // ADC $0, R16, R2
MULU R1, R14
MOVW LO, R15
MOVW HI, R16
ADDU R10, R15 // ADDS R10, R15, R15 (cr=R26)
SGTU R10, R15, R26 // ...
ADDU R26, R16 // ADC $0, R16, R16
ADDU R2, R15, R14 // ADDS R2, R15, R14 (cr=R26)
SGTU R2, R14, R26 // ...
ADDU R26, R16, R2 // ADC $0, R16, R2
MOVW R11, 0(R6)
MOVW R12, 4(R6)
MOVW R13, 8(R6)
MOVW R14, 12(R6)
ADDU $16, R4
ADDU $16, R5
ADDU $16, R6
SUBU $1, R3
BNE R3, loop4cont
loop4done:
MOVW R2, c+44(FP)
RET

View File

@ -1,469 +1,386 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go && (ppc64 || ppc64le)
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// func addVV(z, y, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R7 // R7 = z_len
MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R9 // R9 = y[]
MOVD z+0(FP), R10 // R10 = z[]
// If z_len = 0, we are done
CMP R7, $0
MOVD R0, R4
BEQ done
// Process the first iteration out of the loop so we can
// use MOVDU and avoid 3 index registers updates.
MOVD 0(R8), R11 // R11 = x[i]
MOVD 0(R9), R12 // R12 = y[i]
ADD $-1, R7 // R7 = z_len - 1
ADDC R12, R11, R15 // R15 = x[i] + y[i], set CA
CMP R7, $0
MOVD R15, 0(R10) // z[i]
BEQ final // If z_len was 1, we are done
SRD $2, R7, R5 // R5 = z_len/4
CMP R5, $0
MOVD R5, CTR // Set up loop counter
BEQ tail // If R5 = 0, we can't use the loop
// Process 4 elements per iteration. Unrolling this loop
// means a performance trade-off: we will lose performance
// for small values of z_len (0.90x in the worst case), but
// gain significant performance as z_len increases (up to
// 1.45x).
PCALIGN $16
loop:
MOVD 8(R8), R11 // R11 = x[i]
MOVD 16(R8), R12 // R12 = x[i+1]
MOVD 24(R8), R14 // R14 = x[i+2]
MOVDU 32(R8), R15 // R15 = x[i+3]
MOVD 8(R9), R16 // R16 = y[i]
MOVD 16(R9), R17 // R17 = y[i+1]
MOVD 24(R9), R18 // R18 = y[i+2]
MOVDU 32(R9), R19 // R19 = y[i+3]
ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
ADDE R12, R17, R21 // R21 = x[i+1] + y[i+1] + CA
ADDE R14, R18, R22 // R22 = x[i+2] + y[i+2] + CA
ADDE R15, R19, R23 // R23 = x[i+3] + y[i+3] + CA
MOVD R20, 8(R10) // z[i]
MOVD R21, 16(R10) // z[i+1]
MOVD R22, 24(R10) // z[i+2]
MOVDU R23, 32(R10) // z[i+3]
ADD $-4, R7 // R7 = z_len - 4
BDNZ loop
// We may have more elements to read
CMP R7, $0
BEQ final
// Process the remaining elements, one at a time
tail:
MOVDU 8(R8), R11 // R11 = x[i]
MOVDU 8(R9), R16 // R16 = y[i]
ADD $-1, R7 // R7 = z_len - 1
ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
CMP R7, $0
MOVDU R20, 8(R10) // z[i]
BEQ final // If R7 = 0, we are done
MOVDU 8(R8), R11
MOVDU 8(R9), R16
ADD $-1, R7
ADDE R11, R16, R20
CMP R7, $0
MOVDU R20, 8(R10)
BEQ final
MOVD 8(R8), R11
MOVD 8(R9), R16
ADDE R11, R16, R20
MOVD R20, 8(R10)
final:
ADDZE R4 // Capture CA
done:
MOVD R4, c+72(FP)
MOVD z_len+8(FP), R3
MOVD x_base+24(FP), R4
MOVD y_base+48(FP), R5
MOVD z_base+0(FP), R6
// compute unrolled loop lengths
ANDCC $3, R3, R7
SRD $2, R3
ADDC R0, R3 // clear carry
loop1:
CMP R7, $0; BEQ loop1done; MOVD R7, CTR
loop1cont:
// unroll 1X
MOVD 0(R4), R8
MOVD 0(R5), R9
ADDE R9, R8
MOVD R8, 0(R6)
ADD $8, R4
ADD $8, R5
ADD $8, R6
BDNZ loop1cont
loop1done:
loop4:
CMP R3, $0; BEQ loop4done; MOVD R3, CTR
loop4cont:
// unroll 4X
MOVD 0(R4), R7
MOVD 8(R4), R8
MOVD 16(R4), R9
MOVD 24(R4), R10
MOVD 0(R5), R11
MOVD 8(R5), R12
MOVD 16(R5), R14
MOVD 24(R5), R15
ADDE R11, R7
ADDE R12, R8
ADDE R14, R9
ADDE R15, R10
MOVD R7, 0(R6)
MOVD R8, 8(R6)
MOVD R9, 16(R6)
MOVD R10, 24(R6)
ADD $32, R4
ADD $32, R5
ADD $32, R6
BDNZ loop4cont
loop4done:
ADDE R0, R0, R4 // save & convert add carry
MOVD R4, c+72(FP)
RET
// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, carrying
TEXT ·subVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R7 // R7 = z_len
MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R9 // R9 = y[]
MOVD z+0(FP), R10 // R10 = z[]
// If z_len = 0, we are done
CMP R7, $0
MOVD R0, R4
BEQ done
// Process the first iteration out of the loop so we can
// use MOVDU and avoid 3 index registers updates.
MOVD 0(R8), R11 // R11 = x[i]
MOVD 0(R9), R12 // R12 = y[i]
ADD $-1, R7 // R7 = z_len - 1
SUBC R12, R11, R15 // R15 = x[i] - y[i], set CA
CMP R7, $0
MOVD R15, 0(R10) // z[i]
BEQ final // If z_len was 1, we are done
SRD $2, R7, R5 // R5 = z_len/4
CMP R5, $0
MOVD R5, CTR // Set up loop counter
BEQ tail // If R5 = 0, we can't use the loop
// Process 4 elements per iteration. Unrolling this loop
// means a performance trade-off: we will lose performance
// for small values of z_len (0.92x in the worst case), but
// gain significant performance as z_len increases (up to
// 1.45x).
PCALIGN $16
loop:
MOVD 8(R8), R11 // R11 = x[i]
MOVD 16(R8), R12 // R12 = x[i+1]
MOVD 24(R8), R14 // R14 = x[i+2]
MOVDU 32(R8), R15 // R15 = x[i+3]
MOVD 8(R9), R16 // R16 = y[i]
MOVD 16(R9), R17 // R17 = y[i+1]
MOVD 24(R9), R18 // R18 = y[i+2]
MOVDU 32(R9), R19 // R19 = y[i+3]
SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA
SUBE R17, R12, R21 // R21 = x[i+1] - y[i+1] + CA
SUBE R18, R14, R22 // R22 = x[i+2] - y[i+2] + CA
SUBE R19, R15, R23 // R23 = x[i+3] - y[i+3] + CA
MOVD R20, 8(R10) // z[i]
MOVD R21, 16(R10) // z[i+1]
MOVD R22, 24(R10) // z[i+2]
MOVDU R23, 32(R10) // z[i+3]
ADD $-4, R7 // R7 = z_len - 4
BDNZ loop
// We may have more elements to read
CMP R7, $0
BEQ final
// Process the remaining elements, one at a time
tail:
MOVDU 8(R8), R11 // R11 = x[i]
MOVDU 8(R9), R16 // R16 = y[i]
ADD $-1, R7 // R7 = z_len - 1
SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA
CMP R7, $0
MOVDU R20, 8(R10) // z[i]
BEQ final // If R7 = 0, we are done
MOVDU 8(R8), R11
MOVDU 8(R9), R16
ADD $-1, R7
SUBE R16, R11, R20
CMP R7, $0
MOVDU R20, 8(R10)
BEQ final
MOVD 8(R8), R11
MOVD 8(R9), R16
SUBE R16, R11, R20
MOVD R20, 8(R10)
final:
ADDZE R4
XOR $1, R4
done:
MOVD R4, c+72(FP)
MOVD z_len+8(FP), R3
MOVD x_base+24(FP), R4
MOVD y_base+48(FP), R5
MOVD z_base+0(FP), R6
// compute unrolled loop lengths
ANDCC $3, R3, R7
SRD $2, R3
SUBC R0, R3 // clear carry
loop1:
CMP R7, $0; BEQ loop1done; MOVD R7, CTR
loop1cont:
// unroll 1X
MOVD 0(R4), R8
MOVD 0(R5), R9
SUBE R9, R8
MOVD R8, 0(R6)
ADD $8, R4
ADD $8, R5
ADD $8, R6
BDNZ loop1cont
loop1done:
loop4:
CMP R3, $0; BEQ loop4done; MOVD R3, CTR
loop4cont:
// unroll 4X
MOVD 0(R4), R7
MOVD 8(R4), R8
MOVD 16(R4), R9
MOVD 24(R4), R10
MOVD 0(R5), R11
MOVD 8(R5), R12
MOVD 16(R5), R14
MOVD 24(R5), R15
SUBE R11, R7
SUBE R12, R8
SUBE R14, R9
SUBE R15, R10
MOVD R7, 0(R6)
MOVD R8, 8(R6)
MOVD R9, 16(R6)
MOVD R10, 24(R6)
ADD $32, R4
ADD $32, R5
ADD $32, R6
BDNZ loop4cont
loop4done:
SUBE R4, R4 // save carry
SUB R4, R0, R4 // convert sub carry
MOVD R4, c+72(FP)
RET
//func lshVU(z, x []Word, s uint) (c Word)
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0
MOVD z+0(FP), R3
MOVD x+24(FP), R6
MOVD s+48(FP), R9
MOVD z_len+8(FP), R4
MOVD x_len+32(FP), R7
CMP R4, $0 // len(z)==0 return
BEQ done
ADD $-1, R4, R5 // len(z)-1
SUBC R9, $64, R4 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
SLD $3, R5, R7
ADD R6, R7, R15 // save starting address &x[len(z)-1]
ADD R3, R7, R16 // save starting address &z[len(z)-1]
MOVD (R6)(R7), R14
SRD R4, R14, R7 // compute x[len(z)-1]>>ŝ into R7
CMP R5, $0 // iterate from i=len(z)-1 to 0
BEQ loopexit // Already at end?
MOVD 0(R15),R10 // x[i]
PCALIGN $16
shloop:
SLD R9, R10, R10 // x[i]<<s
MOVDU -8(R15), R14
SRD R4, R14, R11 // x[i-1]>>ŝ
OR R11, R10, R10
MOVD R10, 0(R16) // z[i-1]=x[i]<<s | x[i-1]>>ŝ
MOVD R14, R10 // reuse x[i-1] for next iteration
ADD $-8, R16 // i--
CMP R15, R6 // &x[i-1]>&x[0]?
BGT shloop
loopexit:
MOVD 0(R6), R4
SLD R9, R4, R4
MOVD R4, 0(R3) // z[0]=x[0]<<s
MOVD R7, c+56(FP) // store pre-computed x[len(z)-1]>>ŝ into c
MOVD z_len+8(FP), R3
CMP R3, $0; BEQ ret0
MOVD s+48(FP), R4
MOVD x_base+24(FP), R5
MOVD z_base+0(FP), R6
// run loop backward
SLD $3, R3, R7
ADD R7, R5
SLD $3, R3, R7
ADD R7, R6
// shift first word into carry
MOVD -8(R5), R7
MOVD $64, R8
SUB R4, R8
SRD R8, R7, R9
SLD R4, R7
MOVD R9, c+56(FP)
// shift remaining words
SUB $1, R3
// compute unrolled loop lengths
ANDCC $3, R3, R9
SRD $2, R3
loop1:
CMP R9, $0; BEQ loop1done; MOVD R9, CTR
loop1cont:
// unroll 1X
MOVD -16(R5), R10
SRD R8, R10, R11
OR R7, R11
SLD R4, R10, R7
MOVD R11, -8(R6)
ADD $-8, R5
ADD $-8, R6
BDNZ loop1cont
loop1done:
loop4:
CMP R3, $0; BEQ loop4done; MOVD R3, CTR
loop4cont:
// unroll 4X
MOVD -16(R5), R9
MOVD -24(R5), R10
MOVD -32(R5), R11
MOVD -40(R5), R12
SRD R8, R9, R14
OR R7, R14
SLD R4, R9, R7
SRD R8, R10, R9
OR R7, R9
SLD R4, R10, R7
SRD R8, R11, R10
OR R7, R10
SLD R4, R11, R7
SRD R8, R12, R11
OR R7, R11
SLD R4, R12, R7
MOVD R14, -8(R6)
MOVD R9, -16(R6)
MOVD R10, -24(R6)
MOVD R11, -32(R6)
ADD $-32, R5
ADD $-32, R6
BDNZ loop4cont
loop4done:
// store final shifted bits
MOVD R7, -8(R6)
RET
done:
MOVD R0, c+56(FP) // c=0
ret0:
MOVD R0, c+56(FP)
RET
//func rshVU(z, x []Word, s uint) (c Word)
// func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB), NOSPLIT, $0
MOVD z+0(FP), R3
MOVD x+24(FP), R6
MOVD s+48(FP), R9
MOVD z_len+8(FP), R4
MOVD x_len+32(FP), R7
CMP R4, $0 // len(z)==0 return
BEQ done
SUBC R9, $64, R5 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
MOVD 0(R6), R7
SLD R5, R7, R7 // compute x[0]<<ŝ
MOVD $1, R8 // iterate from i=1 to i<len(z)
CMP R8, R4
BGE loopexit // Already at end?
// vectorize if len(z) is >=3, else jump to scalar loop
CMP R4, $3
BLT scalar
MTVSRD R9, VS38 // s
VSPLTB $7, V6, V4
MTVSRD R5, VS39 // ŝ
VSPLTB $7, V7, V2
ADD $-2, R4, R16
PCALIGN $16
loopback:
ADD $-1, R8, R10
SLD $3, R10
LXVD2X (R6)(R10), VS32 // load x[i-1], x[i]
SLD $3, R8, R12
LXVD2X (R6)(R12), VS33 // load x[i], x[i+1]
VSRD V0, V4, V3 // x[i-1]>>s, x[i]>>s
VSLD V1, V2, V5 // x[i]<<ŝ, x[i+1]<<ŝ
VOR V3, V5, V5 // Or(|) the two registers together
STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
ADD $2, R8 // Done processing 2 entries, i and i+1
CMP R8, R16 // Are there at least a couple of more entries left?
BLE loopback
CMP R8, R4 // Are we at the last element?
BEQ loopexit
scalar:
ADD $-1, R8, R10
SLD $3, R10
MOVD (R6)(R10),R11
SRD R9, R11, R11 // x[len(z)-2] >> s
SLD $3, R8, R12
MOVD (R6)(R12), R12
SLD R5, R12, R12 // x[len(z)-1]<<ŝ
OR R12, R11, R11 // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
MOVD R11, (R3)(R10) // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
loopexit:
ADD $-1, R4
SLD $3, R4
MOVD (R6)(R4), R5
SRD R9, R5, R5 // x[len(z)-1]>>s
MOVD R5, (R3)(R4) // z[len(z)-1]=x[len(z)-1]>>s
MOVD R7, c+56(FP) // store pre-computed x[0]<<ŝ into c
MOVD z_len+8(FP), R3
CMP R3, $0; BEQ ret0
MOVD s+48(FP), R4
MOVD x_base+24(FP), R5
MOVD z_base+0(FP), R6
// shift first word into carry
MOVD 0(R5), R7
MOVD $64, R8
SUB R4, R8
SLD R8, R7, R9
SRD R4, R7
MOVD R9, c+56(FP)
// shift remaining words
SUB $1, R3
// compute unrolled loop lengths
ANDCC $3, R3, R9
SRD $2, R3
loop1:
CMP R9, $0; BEQ loop1done; MOVD R9, CTR
loop1cont:
// unroll 1X
MOVD 8(R5), R10
SLD R8, R10, R11
OR R7, R11
SRD R4, R10, R7
MOVD R11, 0(R6)
ADD $8, R5
ADD $8, R6
BDNZ loop1cont
loop1done:
loop4:
CMP R3, $0; BEQ loop4done; MOVD R3, CTR
loop4cont:
// unroll 4X
MOVD 8(R5), R9
MOVD 16(R5), R10
MOVD 24(R5), R11
MOVD 32(R5), R12
SLD R8, R9, R14
OR R7, R14
SRD R4, R9, R7
SLD R8, R10, R9
OR R7, R9
SRD R4, R10, R7
SLD R8, R11, R10
OR R7, R10
SRD R4, R11, R7
SLD R8, R12, R11
OR R7, R11
SRD R4, R12, R7
MOVD R14, 0(R6)
MOVD R9, 8(R6)
MOVD R10, 16(R6)
MOVD R11, 24(R6)
ADD $32, R5
ADD $32, R6
BDNZ loop4cont
loop4done:
// store final shifted bits
MOVD R7, 0(R6)
RET
done:
MOVD R0, c+56(FP)
ret0:
MOVD R0, c+56(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R10 // R10 = z[]
MOVD x+24(FP), R8 // R8 = x[]
MOVD m+48(FP), R9 // R9 = m
MOVD a+56(FP), R4 // R4 = a = c
MOVD z_len+8(FP), R11 // R11 = z_len
CMP R11, $0
BEQ done
MOVD 0(R8), R20
ADD $-1, R11
MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y)
MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y)
ADDC R4, R6 // R6 = z0 + r
ADDZE R7, R4 // R4 = z1 + CA
CMP R11, $0
MOVD R6, 0(R10) // z[i]
BEQ done
// We will read 4 elements per iteration
SRDCC $2, R11, R14 // R14 = z_len/4
DCBT (R8)
MOVD R14, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
PCALIGN $16
loop:
MOVD 8(R8), R20 // R20 = x[i]
MOVD 16(R8), R21 // R21 = x[i+1]
MOVD 24(R8), R22 // R22 = x[i+2]
MOVDU 32(R8), R23 // R23 = x[i+3]
MULLD R9, R20, R24 // R24 = z0[i]
MULHDU R9, R20, R20 // R20 = z1[i]
ADDC R4, R24 // R24 = z0[i] + c
MULLD R9, R21, R25
MULHDU R9, R21, R21
ADDE R20, R25
MULLD R9, R22, R26
MULHDU R9, R22, R22
MULLD R9, R23, R27
MULHDU R9, R23, R23
ADDE R21, R26
MOVD R24, 8(R10) // z[i]
MOVD R25, 16(R10) // z[i+1]
ADDE R22, R27
ADDZE R23,R4 // update carry
MOVD R26, 24(R10) // z[i+2]
MOVDU R27, 32(R10) // z[i+3]
ADD $-4, R11 // R11 = z_len - 4
BDNZ loop
// We may have some elements to read
CMP R11, $0
BEQ done
// Process the remaining elements, one at a time
tail:
MOVDU 8(R8), R20 // R20 = x[i]
MULLD R9, R20, R24 // R24 = z0[i]
MULHDU R9, R20, R25 // R25 = z1[i]
ADD $-1, R11 // R11 = z_len - 1
ADDC R4, R24
ADDZE R25, R4
MOVDU R24, 8(R10) // z[i]
CMP R11, $0
BEQ done // If R11 = 0, we are done
MOVDU 8(R8), R20
MULLD R9, R20, R24
MULHDU R9, R20, R25
ADD $-1, R11
ADDC R4, R24
ADDZE R25, R4
MOVDU R24, 8(R10)
CMP R11, $0
BEQ done
MOVD 8(R8), R20
MULLD R9, R20, R24
MULHDU R9, R20, R25
ADD $-1, R11
ADDC R4, R24
ADDZE R25,R4
MOVD R24, 8(R10)
done:
MOVD R4, c+64(FP)
MOVD m+48(FP), R3
MOVD a+56(FP), R4
MOVD z_len+8(FP), R5
MOVD x_base+24(FP), R6
MOVD z_base+0(FP), R7
// compute unrolled loop lengths
ANDCC $3, R5, R8
SRD $2, R5
loop1:
CMP R8, $0; BEQ loop1done; MOVD R8, CTR
loop1cont:
// unroll 1X
MOVD 0(R6), R9
// multiply
MULHDU R3, R9, R10
MULLD R3, R9
ADDC R4, R9
ADDE R0, R10, R4
MOVD R9, 0(R7)
ADD $8, R6
ADD $8, R7
BDNZ loop1cont
loop1done:
loop4:
CMP R5, $0; BEQ loop4done; MOVD R5, CTR
loop4cont:
// unroll 4X
MOVD 0(R6), R8
MOVD 8(R6), R9
MOVD 16(R6), R10
MOVD 24(R6), R11
// multiply
MULHDU R3, R8, R12
MULLD R3, R8
ADDC R4, R8
MULHDU R3, R9, R14
MULLD R3, R9
ADDE R12, R9
MULHDU R3, R10, R12
MULLD R3, R10
ADDE R14, R10
MULHDU R3, R11, R14
MULLD R3, R11
ADDE R12, R11
ADDE R0, R14, R4
MOVD R8, 0(R7)
MOVD R9, 8(R7)
MOVD R10, 16(R7)
MOVD R11, 24(R7)
ADD $32, R6
ADD $32, R7
BDNZ loop4cont
loop4done:
MOVD R4, c+64(FP)
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R22 // R22 = z[]
MOVD x+24(FP), R3 // R3 = x[]
MOVD y+48(FP), R4 // R4 = y[]
MOVD m+72(FP), R5 // R5 = m
MOVD z_len+8(FP), R6 // R6 = z_len
CMP R6, $4
MOVD a+80(FP), R9 // R9 = c = a
BLT tail
SRD $2, R6, R7
MOVD R7, CTR // Initialize loop counter
PCALIGN $16
loop:
MOVD 0(R4), R14 // y[i]
MOVD 8(R4), R16 // y[i+1]
MOVD 16(R4), R18 // y[i+2]
MOVD 24(R4), R20 // y[i+3]
MOVD 0(R3), R15 // x[i]
MOVD 8(R3), R17 // x[i+1]
MOVD 16(R3), R19 // x[i+2]
MOVD 24(R3), R21 // x[i+3]
MULLD R5, R14, R10 // low y[i]*m
MULHDU R5, R14, R11 // high y[i]*m
ADDC R15, R10
ADDZE R11
ADDC R9, R10
ADDZE R11, R9
MULLD R5, R16, R14 // low y[i+1]*m
MULHDU R5, R16, R15 // high y[i+1]*m
ADDC R17, R14
ADDZE R15
ADDC R9, R14
ADDZE R15, R9
MULLD R5, R18, R16 // low y[i+2]*m
MULHDU R5, R18, R17 // high y[i+2]*m
ADDC R19, R16
ADDZE R17
ADDC R9, R16
ADDZE R17, R9
MULLD R5, R20, R18 // low y[i+3]*m
MULHDU R5, R20, R19 // high y[i+3]*m
ADDC R21, R18
ADDZE R19
ADDC R9, R18
ADDZE R19, R9
MOVD R10, 0(R22) // z[i]
MOVD R14, 8(R22) // z[i+1]
MOVD R16, 16(R22) // z[i+2]
MOVD R18, 24(R22) // z[i+3]
ADD $32, R3
ADD $32, R4
ADD $32, R22
BDNZ loop
ANDCC $3, R6
tail:
CMP R6, $0
BEQ done
MOVD R6, CTR
PCALIGN $16
tailloop:
MOVD 0(R4), R14
MOVD 0(R3), R15
MULLD R5, R14, R10
MULHDU R5, R14, R11
ADDC R15, R10
ADDZE R11
ADDC R9, R10
ADDZE R11, R9
MOVD R10, 0(R22)
ADD $8, R3
ADD $8, R4
ADD $8, R22
BDNZ tailloop
done:
MOVD R9, c+88(FP)
MOVD m+72(FP), R3
MOVD a+80(FP), R4
MOVD z_len+8(FP), R5
MOVD x_base+24(FP), R6
MOVD y_base+48(FP), R7
MOVD z_base+0(FP), R8
// compute unrolled loop lengths
ANDCC $3, R5, R9
SRD $2, R5
loop1:
CMP R9, $0; BEQ loop1done; MOVD R9, CTR
loop1cont:
// unroll 1X
MOVD 0(R6), R10
MOVD 0(R7), R11
// multiply
MULHDU R3, R11, R12
MULLD R3, R11
ADDC R4, R11
ADDE R0, R12, R4
// add
ADDC R10, R11
ADDE R0, R4
MOVD R11, 0(R8)
ADD $8, R6
ADD $8, R7
ADD $8, R8
BDNZ loop1cont
loop1done:
loop4:
CMP R5, $0; BEQ loop4done; MOVD R5, CTR
loop4cont:
// unroll 4X
MOVD 0(R6), R9
MOVD 8(R6), R10
MOVD 16(R6), R11
MOVD 24(R6), R12
MOVD 0(R7), R14
MOVD 8(R7), R15
MOVD 16(R7), R16
MOVD 24(R7), R17
// multiply
MULHDU R3, R14, R18
MULLD R3, R14
ADDC R4, R14
MULHDU R3, R15, R19
MULLD R3, R15
ADDE R18, R15
MULHDU R3, R16, R18
MULLD R3, R16
ADDE R19, R16
MULHDU R3, R17, R19
MULLD R3, R17
ADDE R18, R17
ADDE R0, R19, R4
// add
ADDC R9, R14
ADDE R10, R15
ADDE R11, R16
ADDE R12, R17
ADDE R0, R4
MOVD R14, 0(R8)
MOVD R15, 8(R8)
MOVD R16, 16(R8)
MOVD R17, 24(R8)
ADD $32, R6
ADD $32, R7
ADD $32, R8
BDNZ loop4cont
loop4done:
MOVD R4, c+88(FP)
RET

View File

@ -1,353 +1,457 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !math_big_pure_go && riscv64
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
TEXT ·addVV(SB),NOSPLIT,$0
MOV x+24(FP), X5
MOV y+48(FP), X6
MOV z+0(FP), X7
MOV z_len+8(FP), X30
MOV $4, X28
MOV $0, X29 // c = 0
BEQZ X30, done
BLTU X30, X28, loop1
loop4:
MOV 0(X5), X8 // x[0]
MOV 0(X6), X9 // y[0]
MOV 8(X5), X11 // x[1]
MOV 8(X6), X12 // y[1]
MOV 16(X5), X14 // x[2]
MOV 16(X6), X15 // y[2]
MOV 24(X5), X17 // x[3]
MOV 24(X6), X18 // y[3]
ADD X8, X9, X21 // z[0] = x[0] + y[0]
SLTU X8, X21, X22
ADD X21, X29, X10 // z[0] = x[0] + y[0] + c
SLTU X21, X10, X23
ADD X22, X23, X29 // next c
ADD X11, X12, X24 // z[1] = x[1] + y[1]
SLTU X11, X24, X25
ADD X24, X29, X13 // z[1] = x[1] + y[1] + c
SLTU X24, X13, X26
ADD X25, X26, X29 // next c
ADD X14, X15, X21 // z[2] = x[2] + y[2]
SLTU X14, X21, X22
ADD X21, X29, X16 // z[2] = x[2] + y[2] + c
SLTU X21, X16, X23
ADD X22, X23, X29 // next c
ADD X17, X18, X21 // z[3] = x[3] + y[3]
SLTU X17, X21, X22
ADD X21, X29, X19 // z[3] = x[3] + y[3] + c
SLTU X21, X19, X23
ADD X22, X23, X29 // next c
MOV X10, 0(X7) // z[0]
MOV X13, 8(X7) // z[1]
MOV X16, 16(X7) // z[2]
MOV X19, 24(X7) // z[3]
ADD $32, X5
ADD $32, X6
ADD $32, X7
SUB $4, X30
BGEU X30, X28, loop4
BEQZ X30, done
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB), NOSPLIT, $0
MOV z_len+8(FP), X5
MOV x_base+24(FP), X6
MOV y_base+48(FP), X7
MOV z_base+0(FP), X8
// compute unrolled loop lengths
AND $3, X5, X9
SRL $2, X5
XOR X28, X28 // clear carry
loop1:
MOV 0(X5), X10 // x
MOV 0(X6), X11 // y
ADD X10, X11, X12 // z = x + y
SLTU X10, X12, X14
ADD X12, X29, X13 // z = x + y + c
SLTU X12, X13, X15
ADD X14, X15, X29 // next c
MOV X13, 0(X7) // z
ADD $8, X5
ADD $8, X6
ADD $8, X7
SUB $1, X30
BNEZ X30, loop1
done:
MOV X29, c+72(FP) // return c
BEQZ X9, loop1done
loop1cont:
// unroll 1X
MOV 0(X6), X10
MOV 0(X7), X11
ADD X11, X10 // ADCS X11, X10, X10 (cr=X28)
SLTU X11, X10, X31 // ...
ADD X28, X10 // ...
SLTU X28, X10, X28 // ...
ADD X31, X28 // ...
MOV X10, 0(X8)
ADD $8, X6
ADD $8, X7
ADD $8, X8
SUB $1, X9
BNEZ X9, loop1cont
loop1done:
loop4:
BEQZ X5, loop4done
loop4cont:
// unroll 4X
MOV 0(X6), X9
MOV 8(X6), X10
MOV 16(X6), X11
MOV 24(X6), X12
MOV 0(X7), X13
MOV 8(X7), X14
MOV 16(X7), X15
MOV 24(X7), X16
ADD X13, X9 // ADCS X13, X9, X9 (cr=X28)
SLTU X13, X9, X31 // ...
ADD X28, X9 // ...
SLTU X28, X9, X28 // ...
ADD X31, X28 // ...
ADD X14, X10 // ADCS X14, X10, X10 (cr=X28)
SLTU X14, X10, X31 // ...
ADD X28, X10 // ...
SLTU X28, X10, X28 // ...
ADD X31, X28 // ...
ADD X15, X11 // ADCS X15, X11, X11 (cr=X28)
SLTU X15, X11, X31 // ...
ADD X28, X11 // ...
SLTU X28, X11, X28 // ...
ADD X31, X28 // ...
ADD X16, X12 // ADCS X16, X12, X12 (cr=X28)
SLTU X16, X12, X31 // ...
ADD X28, X12 // ...
SLTU X28, X12, X28 // ...
ADD X31, X28 // ...
MOV X9, 0(X8)
MOV X10, 8(X8)
MOV X11, 16(X8)
MOV X12, 24(X8)
ADD $32, X6
ADD $32, X7
ADD $32, X8
SUB $1, X5
BNEZ X5, loop4cont
loop4done:
MOV X28, c+72(FP)
RET
TEXT ·subVV(SB),NOSPLIT,$0
MOV x+24(FP), X5
MOV y+48(FP), X6
MOV z+0(FP), X7
MOV z_len+8(FP), X30
MOV $4, X28
MOV $0, X29 // b = 0
BEQZ X30, done
BLTU X30, X28, loop1
loop4:
MOV 0(X5), X8 // x[0]
MOV 0(X6), X9 // y[0]
MOV 8(X5), X11 // x[1]
MOV 8(X6), X12 // y[1]
MOV 16(X5), X14 // x[2]
MOV 16(X6), X15 // y[2]
MOV 24(X5), X17 // x[3]
MOV 24(X6), X18 // y[3]
SUB X9, X8, X21 // z[0] = x[0] - y[0]
SLTU X21, X8, X22
SUB X29, X21, X10 // z[0] = x[0] - y[0] - b
SLTU X10, X21, X23
ADD X22, X23, X29 // next b
SUB X12, X11, X24 // z[1] = x[1] - y[1]
SLTU X24, X11, X25
SUB X29, X24, X13 // z[1] = x[1] - y[1] - b
SLTU X13, X24, X26
ADD X25, X26, X29 // next b
SUB X15, X14, X21 // z[2] = x[2] - y[2]
SLTU X21, X14, X22
SUB X29, X21, X16 // z[2] = x[2] - y[2] - b
SLTU X16, X21, X23
ADD X22, X23, X29 // next b
SUB X18, X17, X21 // z[3] = x[3] - y[3]
SLTU X21, X17, X22
SUB X29, X21, X19 // z[3] = x[3] - y[3] - b
SLTU X19, X21, X23
ADD X22, X23, X29 // next b
MOV X10, 0(X7) // z[0]
MOV X13, 8(X7) // z[1]
MOV X16, 16(X7) // z[2]
MOV X19, 24(X7) // z[3]
ADD $32, X5
ADD $32, X6
ADD $32, X7
SUB $4, X30
BGEU X30, X28, loop4
BEQZ X30, done
// func subVV(z, x, y []Word) (c Word)
TEXT ·subVV(SB), NOSPLIT, $0
MOV z_len+8(FP), X5
MOV x_base+24(FP), X6
MOV y_base+48(FP), X7
MOV z_base+0(FP), X8
// compute unrolled loop lengths
AND $3, X5, X9
SRL $2, X5
XOR X28, X28 // clear carry
loop1:
MOV 0(X5), X10 // x
MOV 0(X6), X11 // y
SUB X11, X10, X12 // z = x - y
SLTU X12, X10, X14
SUB X29, X12, X13 // z = x - y - b
SLTU X13, X12, X15
ADD X14, X15, X29 // next b
MOV X13, 0(X7) // z
ADD $8, X5
ADD $8, X6
ADD $8, X7
SUB $1, X30
BNEZ X30, loop1
done:
MOV X29, c+72(FP) // return b
BEQZ X9, loop1done
loop1cont:
// unroll 1X
MOV 0(X6), X10
MOV 0(X7), X11
SLTU X28, X10, X31 // SBCS X11, X10, X10
SUB X28, X10 // ...
SLTU X11, X10, X28 // ...
SUB X11, X10 // ...
ADD X31, X28 // ...
MOV X10, 0(X8)
ADD $8, X6
ADD $8, X7
ADD $8, X8
SUB $1, X9
BNEZ X9, loop1cont
loop1done:
loop4:
BEQZ X5, loop4done
loop4cont:
// unroll 4X
MOV 0(X6), X9
MOV 8(X6), X10
MOV 16(X6), X11
MOV 24(X6), X12
MOV 0(X7), X13
MOV 8(X7), X14
MOV 16(X7), X15
MOV 24(X7), X16
SLTU X28, X9, X31 // SBCS X13, X9, X9
SUB X28, X9 // ...
SLTU X13, X9, X28 // ...
SUB X13, X9 // ...
ADD X31, X28 // ...
SLTU X28, X10, X31 // SBCS X14, X10, X10
SUB X28, X10 // ...
SLTU X14, X10, X28 // ...
SUB X14, X10 // ...
ADD X31, X28 // ...
SLTU X28, X11, X31 // SBCS X15, X11, X11
SUB X28, X11 // ...
SLTU X15, X11, X28 // ...
SUB X15, X11 // ...
ADD X31, X28 // ...
SLTU X28, X12, X31 // SBCS X16, X12, X12
SUB X28, X12 // ...
SLTU X16, X12, X28 // ...
SUB X16, X12 // ...
ADD X31, X28 // ...
MOV X9, 0(X8)
MOV X10, 8(X8)
MOV X11, 16(X8)
MOV X12, 24(X8)
ADD $32, X6
ADD $32, X7
ADD $32, X8
SUB $1, X5
BNEZ X5, loop4cont
loop4done:
MOV X28, c+72(FP)
RET
// lshVU: no assembly implementation here; tail-jump to the generic
// (pure Go) version lshVU_g.
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)
// rshVU: no assembly implementation here; tail-jump to the generic
// (pure Go) version rshVU_g.
TEXT ·rshVU(SB),NOSPLIT,$0
JMP ·rshVU_g(SB)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOV x+24(FP), X5
MOV m+48(FP), X6
MOV z+0(FP), X7
MOV z_len+8(FP), X30
MOV a+56(FP), X29
MOV $4, X28
BEQ ZERO, X30, done
BLTU X30, X28, loop1
loop4:
MOV 0(X5), X8 // x[0]
MOV 8(X5), X11 // x[1]
MOV 16(X5), X14 // x[2]
MOV 24(X5), X17 // x[3]
MULHU X8, X6, X9 // z_hi[0] = x[0] * m
MUL X8, X6, X8 // z_lo[0] = x[0] * m
ADD X8, X29, X10 // z[0] = z_lo[0] + c
SLTU X8, X10, X23
ADD X23, X9, X29 // next c
MULHU X11, X6, X12 // z_hi[1] = x[1] * m
MUL X11, X6, X11 // z_lo[1] = x[1] * m
ADD X11, X29, X13 // z[1] = z_lo[1] + c
SLTU X11, X13, X23
ADD X23, X12, X29 // next c
MULHU X14, X6, X15 // z_hi[2] = x[2] * m
MUL X14, X6, X14 // z_lo[2] = x[2] * m
ADD X14, X29, X16 // z[2] = z_lo[2] + c
SLTU X14, X16, X23
ADD X23, X15, X29 // next c
MULHU X17, X6, X18 // z_hi[3] = x[3] * m
MUL X17, X6, X17 // z_lo[3] = x[3] * m
ADD X17, X29, X19 // z[3] = z_lo[3] + c
SLTU X17, X19, X23
ADD X23, X18, X29 // next c
MOV X10, 0(X7) // z[0]
MOV X13, 8(X7) // z[1]
MOV X16, 16(X7) // z[2]
MOV X19, 24(X7) // z[3]
ADD $32, X5
ADD $32, X7
SUB $4, X30
BGEU X30, X28, loop4
BEQZ X30, done
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0
MOV z_len+8(FP), X5
BEQZ X5, ret0
MOV s+48(FP), X6
MOV x_base+24(FP), X7
MOV z_base+0(FP), X8
// run loop backward
SLL $3, X5, X9
ADD X9, X7
SLL $3, X5, X9
ADD X9, X8
// shift first word into carry
MOV -8(X7), X9
MOV $64, X10
SUB X6, X10
SRL X10, X9, X11
SLL X6, X9
MOV X11, c+56(FP)
// shift remaining words
SUB $1, X5
// compute unrolled loop lengths
AND $3, X5, X11
SRL $2, X5
loop1:
MOV 0(X5), X10 // x
MULHU X10, X6, X12 // z_hi = x * m
MUL X10, X6, X10 // z_lo = x * m
ADD X10, X29, X13 // z_lo + c
SLTU X10, X13, X15
ADD X12, X15, X29 // next c
MOV X13, 0(X7) // z
ADD $8, X5
ADD $8, X7
SUB $1, X30
BNEZ X30, loop1
done:
MOV X29, c+64(FP) // return c
BEQZ X11, loop1done
loop1cont:
// unroll 1X
MOV -16(X7), X12
SRL X10, X12, X13
OR X9, X13
SLL X6, X12, X9
MOV X13, -8(X8)
ADD $-8, X7
ADD $-8, X8
SUB $1, X11
BNEZ X11, loop1cont
loop1done:
loop4:
BEQZ X5, loop4done
loop4cont:
// unroll 4X
MOV -16(X7), X11
MOV -24(X7), X12
MOV -32(X7), X13
MOV -40(X7), X14
SRL X10, X11, X15
OR X9, X15
SLL X6, X11, X9
SRL X10, X12, X11
OR X9, X11
SLL X6, X12, X9
SRL X10, X13, X12
OR X9, X12
SLL X6, X13, X9
SRL X10, X14, X13
OR X9, X13
SLL X6, X14, X9
MOV X15, -8(X8)
MOV X11, -16(X8)
MOV X12, -24(X8)
MOV X13, -32(X8)
ADD $-32, X7
ADD $-32, X8
SUB $1, X5
BNEZ X5, loop4cont
loop4done:
// store final shifted bits
MOV X9, -8(X8)
RET
ret0:
MOV X0, c+56(FP)
RET
TEXT ·addMulVVWW(SB),NOSPLIT,$0
MOV y+48(FP), X5
MOV m+72(FP), X6
MOV x+24(FP), X7
MOV z+0(FP), X20
MOV z_len+8(FP), X30
MOV $4, X28
MOV a+80(FP), X29 // c = a
BEQZ X30, done
BLTU X30, X28, loop1
loop4:
MOV 0(X5), X8 // y[0]
MOV 0(X7), X10 // x[0]
MOV 8(X5), X11 // y[1]
MOV 8(X7), X13 // x[1]
MOV 16(X5), X14 // y[2]
MOV 16(X7), X16 // x[2]
MOV 24(X5), X17 // y[3]
MOV 24(X7), X19 // x[3]
MULHU X8, X6, X9 // x_hi[0] = y[0] * m
MUL X8, X6, X8 // x_lo[0] = y[0] * m
ADD X8, X10, X21 // x_lo[0] = y[0] * m + x[0]
SLTU X8, X21, X22
ADD X9, X22, X9 // x_hi[0] = y[0] * m + x[0]
ADD X21, X29, X10 // x[0] = y[0] * m + x[0] + c
SLTU X21, X10, X22
ADD X9, X22, X29 // next c
MULHU X11, X6, X12 // x_hi[1] = y[1] * m
MUL X11, X6, X11 // x_lo[1] = y[1] * m
ADD X11, X13, X21 // x_lo[1] = y[1] * m + x[1]
SLTU X11, X21, X22
ADD X12, X22, X12 // x_hi[1] = y[1] * m + x[1]
ADD X21, X29, X13 // x[1] = y[1] * m + x[1] + c
SLTU X21, X13, X22
ADD X12, X22, X29 // next c
MULHU X14, X6, X15 // x_hi[2] = y[2] * m
MUL X14, X6, X14 // x_lo[2] = y[2] * m
ADD X14, X16, X21 // x_lo[2] = y[2] * m + x[2]
SLTU X14, X21, X22
ADD X15, X22, X15 // x_hi[2] = y[2] * m + x[2]
ADD X21, X29, X16 // x[2] = y[2] * m + x[2] + c
SLTU X21, X16, X22
ADD X15, X22, X29 // next c
MULHU X17, X6, X18 // x_hi[3] = y[3] * m
MUL X17, X6, X17 // x_lo[3] = y[3] * m
ADD X17, X19, X21 // x_lo[3] = y[3] * m + x[3]
SLTU X17, X21, X22
ADD X18, X22, X18 // x_hi[3] = y[3] * m + x[3]
ADD X21, X29, X19 // x[3] = y[3] * m + x[3] + c
SLTU X21, X19, X22
ADD X18, X22, X29 // next c
MOV X10, 0(X20) // z[0]
MOV X13, 8(X20) // z[1]
MOV X16, 16(X20) // z[2]
MOV X19, 24(X20) // z[3]
ADD $32, X5
ADD $32, X7
ADD $32, X20
SUB $4, X30
BGEU X30, X28, loop4
BEQZ X30, done
// func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB), NOSPLIT, $0
MOV z_len+8(FP), X5
BEQZ X5, ret0
MOV s+48(FP), X6
MOV x_base+24(FP), X7
MOV z_base+0(FP), X8
// shift first word into carry
MOV 0(X7), X9
MOV $64, X10
SUB X6, X10
SLL X10, X9, X11
SRL X6, X9
MOV X11, c+56(FP)
// shift remaining words
SUB $1, X5
// compute unrolled loop lengths
AND $3, X5, X11
SRL $2, X5
loop1:
MOV 0(X5), X10 // y
MOV 0(X7), X11 // x
MULHU X10, X6, X12 // z_hi = y * m
MUL X10, X6, X10 // z_lo = y * m
ADD X10, X11, X13 // z_lo = y * m + x
SLTU X10, X13, X15
ADD X12, X15, X12 // z_hi = y * m + x
ADD X13, X29, X10 // z = y * m + x + c
SLTU X13, X10, X15
ADD X12, X15, X29 // next c
MOV X10, 0(X20) // z
ADD $8, X5
ADD $8, X7
ADD $8, X20
SUB $1, X30
BNEZ X30, loop1
done:
MOV X29, c+88(FP) // return c
BEQZ X11, loop1done
loop1cont:
// unroll 1X
MOV 8(X7), X12
SLL X10, X12, X13
OR X9, X13
SRL X6, X12, X9
MOV X13, 0(X8)
ADD $8, X7
ADD $8, X8
SUB $1, X11
BNEZ X11, loop1cont
loop1done:
loop4:
BEQZ X5, loop4done
loop4cont:
// unroll 4X
MOV 8(X7), X11
MOV 16(X7), X12
MOV 24(X7), X13
MOV 32(X7), X14
SLL X10, X11, X15
OR X9, X15
SRL X6, X11, X9
SLL X10, X12, X11
OR X9, X11
SRL X6, X12, X9
SLL X10, X13, X12
OR X9, X12
SRL X6, X13, X9
SLL X10, X14, X13
OR X9, X13
SRL X6, X14, X9
MOV X15, 0(X8)
MOV X11, 8(X8)
MOV X12, 16(X8)
MOV X13, 24(X8)
ADD $32, X7
ADD $32, X8
SUB $1, X5
BNEZ X5, loop4cont
loop4done:
// store final shifted bits
MOV X9, 0(X8)
RET
ret0:
MOV X0, c+56(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
//
// z[i] = x[i]*m + carry for each word, with the carry chained through
// the vector; the initial carry is a and the final carry is returned.
// RISC-V has no flags register, so the carry is synthesized with SLTU
// (the ARM-style ADDS/ADC pseudo-ops are noted in the inline comments).
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOV m+48(FP), X5 // X5 = m (multiplier)
MOV a+56(FP), X6 // X6 = running carry c, seeded with a
MOV z_len+8(FP), X7 // X7 = len(z)
MOV x_base+24(FP), X8 // X8 = &x[0]
MOV z_base+0(FP), X9 // X9 = &z[0]
// compute unrolled loop lengths
AND $3, X7, X10 // X10 = len % 4 (1-word loop count)
SRL $2, X7 // X7 = len / 4 (4-word loop count)
loop1:
BEQZ X10, loop1done
loop1cont:
// unroll 1X
MOV 0(X8), X11 // X11 = x[i]
// synthetic carry, one column at a time
MUL X5, X11, X12 // X12 = low 64 bits of x[i]*m
MULHU X5, X11, X13 // X13 = high 64 bits of x[i]*m
ADD X6, X12, X11 // ADDS X6, X12, X11 (cr=X28)
SLTU X6, X11, X28 // ...
ADD X28, X13, X6 // ADC $0, X13, X6
MOV X11, 0(X9) // z[i] = result
ADD $8, X8
ADD $8, X9
SUB $1, X10
BNEZ X10, loop1cont
loop1done:
loop4:
BEQZ X7, loop4done
loop4cont:
// unroll 4X
MOV 0(X8), X10 // X10..X13 = x[i..i+3]
MOV 8(X8), X11
MOV 16(X8), X12
MOV 24(X8), X13
// synthetic carry, one column at a time
MUL X5, X10, X14 // low of x[i]*m
MULHU X5, X10, X15 // high of x[i]*m
ADD X6, X14, X10 // ADDS X6, X14, X10 (cr=X28)
SLTU X6, X10, X28 // ...
ADD X28, X15, X6 // ADC $0, X15, X6
MUL X5, X11, X14 // low of x[i+1]*m
MULHU X5, X11, X15 // high of x[i+1]*m
ADD X6, X14, X11 // ADDS X6, X14, X11 (cr=X28)
SLTU X6, X11, X28 // ...
ADD X28, X15, X6 // ADC $0, X15, X6
MUL X5, X12, X14 // low of x[i+2]*m
MULHU X5, X12, X15 // high of x[i+2]*m
ADD X6, X14, X12 // ADDS X6, X14, X12 (cr=X28)
SLTU X6, X12, X28 // ...
ADD X28, X15, X6 // ADC $0, X15, X6
MUL X5, X13, X14 // low of x[i+3]*m
MULHU X5, X13, X15 // high of x[i+3]*m
ADD X6, X14, X13 // ADDS X6, X14, X13 (cr=X28)
SLTU X6, X13, X28 // ...
ADD X28, X15, X6 // ADC $0, X15, X6
MOV X10, 0(X9) // z[i..i+3] = results
MOV X11, 8(X9)
MOV X12, 16(X9)
MOV X13, 24(X9)
ADD $32, X8
ADD $32, X9
SUB $1, X7
BNEZ X7, loop4cont
loop4done:
MOV X6, c+64(FP) // return final carry
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
//
// z[i] = x[i] + y[i]*m + carry for each word, carry chained through
// the vector; the initial carry is a and the final carry is returned.
// Each column does two carrying additions (the low product plus x,
// then plus c), accumulating both carries into the high product.
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOV m+72(FP), X5 // X5 = m (multiplier)
MOV a+80(FP), X6 // X6 = running carry c, seeded with a
MOV z_len+8(FP), X7 // X7 = len(z)
MOV x_base+24(FP), X8 // X8 = &x[0]
MOV y_base+48(FP), X9 // X9 = &y[0]
MOV z_base+0(FP), X10 // X10 = &z[0]
// compute unrolled loop lengths
AND $3, X7, X11 // X11 = len % 4 (1-word loop count)
SRL $2, X7 // X7 = len / 4 (4-word loop count)
loop1:
BEQZ X11, loop1done
loop1cont:
// unroll 1X
MOV 0(X8), X12 // X12 = x[i]
MOV 0(X9), X13 // X13 = y[i]
// synthetic carry, one column at a time
MUL X5, X13, X14 // X14 = low 64 bits of y[i]*m
MULHU X5, X13, X15 // X15 = high 64 bits of y[i]*m
ADD X12, X14 // ADDS X12, X14, X14 (cr=X28)
SLTU X12, X14, X28 // ...
ADD X28, X15 // ADC $0, X15, X15
ADD X6, X14, X13 // ADDS X6, X14, X13 (cr=X28)
SLTU X6, X13, X28 // ...
ADD X28, X15, X6 // ADC $0, X15, X6
MOV X13, 0(X10) // z[i] = result
ADD $8, X8
ADD $8, X9
ADD $8, X10
SUB $1, X11
BNEZ X11, loop1cont
loop1done:
loop4:
BEQZ X7, loop4done
loop4cont:
// unroll 4X
MOV 0(X8), X11 // X11..X14 = x[i..i+3]
MOV 8(X8), X12
MOV 16(X8), X13
MOV 24(X8), X14
MOV 0(X9), X15 // X15..X18 = y[i..i+3]
MOV 8(X9), X16
MOV 16(X9), X17
MOV 24(X9), X18
// synthetic carry, one column at a time
MUL X5, X15, X19 // low of y[i]*m
MULHU X5, X15, X20 // high of y[i]*m
ADD X11, X19 // ADDS X11, X19, X19 (cr=X28)
SLTU X11, X19, X28 // ...
ADD X28, X20 // ADC $0, X20, X20
ADD X6, X19, X15 // ADDS X6, X19, X15 (cr=X28)
SLTU X6, X15, X28 // ...
ADD X28, X20, X6 // ADC $0, X20, X6
MUL X5, X16, X19 // low of y[i+1]*m
MULHU X5, X16, X20 // high of y[i+1]*m
ADD X12, X19 // ADDS X12, X19, X19 (cr=X28)
SLTU X12, X19, X28 // ...
ADD X28, X20 // ADC $0, X20, X20
ADD X6, X19, X16 // ADDS X6, X19, X16 (cr=X28)
SLTU X6, X16, X28 // ...
ADD X28, X20, X6 // ADC $0, X20, X6
MUL X5, X17, X19 // low of y[i+2]*m
MULHU X5, X17, X20 // high of y[i+2]*m
ADD X13, X19 // ADDS X13, X19, X19 (cr=X28)
SLTU X13, X19, X28 // ...
ADD X28, X20 // ADC $0, X20, X20
ADD X6, X19, X17 // ADDS X6, X19, X17 (cr=X28)
SLTU X6, X17, X28 // ...
ADD X28, X20, X6 // ADC $0, X20, X6
MUL X5, X18, X19 // low of y[i+3]*m
MULHU X5, X18, X20 // high of y[i+3]*m
ADD X14, X19 // ADDS X14, X19, X19 (cr=X28)
SLTU X14, X19, X28 // ...
ADD X28, X20 // ADC $0, X20, X20
ADD X6, X19, X18 // ADDS X6, X19, X18 (cr=X28)
SLTU X6, X18, X28 // ...
ADD X28, X20, X6 // ADC $0, X20, X6
MOV X15, 0(X10) // z[i..i+3] = results
MOV X16, 8(X10)
MOV X17, 16(X10)
MOV X18, 24(X10)
ADD $32, X8
ADD $32, X9
ADD $32, X10
SUB $1, X7
BNEZ X7, loop4cont
loop4done:
MOV X6, c+88(FP) // return final carry
RET

File diff suppressed because it is too large. Load Diff

View File

@ -2,14 +2,18 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build s390x && !math_big_pure_go
//go:build !math_big_pure_go
package big
import "testing"
func TestNoVec(t *testing.T) {
// Make sure non-vector versions match vector versions.
t.Run("AddVV", func(t *testing.T) { testVV(t, "addVV_novec", addVV_novec, addVV) })
t.Run("SubVV", func(t *testing.T) { testVV(t, "subVV_novec", subVV_novec, subVV) })
// TestAddVVNoVec re-runs TestAddVV with hasVX forced off for the
// duration of the test, exercising the non-vector addVV code path.
func TestAddVVNoVec(t *testing.T) {
setDuringTest(t, &hasVX, false)
TestAddVV(t)
}
// TestSubVVNoVec re-runs TestSubVV with hasVX forced off for the
// duration of the test, exercising the non-vector subVV code path.
func TestSubVVNoVec(t *testing.T) {
setDuringTest(t, &hasVX, false)
TestSubVV(t)
}

View File

@ -8,11 +8,7 @@ package big
import "internal/cpu"
func addVV_check(z, x, y []Word) (c Word)
func addVV_vec(z, x, y []Word) (c Word)
func addVV_novec(z, x, y []Word) (c Word)
func subVV_check(z, x, y []Word) (c Word)
func subVV_vec(z, x, y []Word) (c Word)
func subVV_novec(z, x, y []Word) (c Word)
// hasVX reports whether the CPU has the s390x vector facility.
// It is a variable rather than a constant so tests can temporarily
// force it off (via setDuringTest) to cover the non-vector paths.
var hasVX = cpu.S390X.HasVX

// addVVvec and subVVvec are the vector-facility versions of addVV
// and subVV, implemented in assembly.
func addVVvec(z, x, y []Word) (c Word)
func subVVvec(z, x, y []Word) (c Word)

View File

@ -0,0 +1,310 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !math_big_pure_go
#include "textflag.h"
// func addVVvec(z, x, y []Word) (c Word)
//
// Vector-facility add-with-carry: words are processed 16 at a time
// with 128-bit vector adds (VACQ plus VACCCQ for the carry-out),
// then 4 at a time with ADDC/ADDE, then one at a time.
// R4 holds the carry between sections as 0 or -1 (NEG of the flag).
TEXT ·addVVvec(SB), NOSPLIT, $0
MOVD z_len+8(FP), R3
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD z+0(FP), R2
MOVD $0, R4 // c = 0
MOVD $0, R0 // make sure it's zero
MOVD $0, R10 // i = 0
// s/BLT/JMP/ below to disable the unrolled loops
SUB $4, R3 // n -= 4
BLT v1 // if n < 0 goto v1
SUB $12, R3 // n -= 16
BLT A1 // if n < 0 goto A1
MOVD R8, R5
MOVD R9, R6
MOVD R2, R7
// n >= 0
// regular loop body unrolled 16x
VZERO V0 // c = 0
UU1:
VLM 0(R5), V1, V4 // 64-bytes into V1..V4
ADD $64, R5
VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
VLM 0(R6), V9, V12 // 64-bytes into V9..V12
ADD $64, R6
VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order
VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
VACCCQ V1, V9, V0, V25 // next carry
VACQ V1, V9, V0, V17 // 128-bit sum with carry-in
VACCCQ V2, V10, V25, V26
VACQ V2, V10, V25, V18
VLM 0(R5), V5, V6 // 32-bytes into V5..V6
VLM 0(R6), V13, V14 // 32-bytes into V13..V14
ADD $32, R5
ADD $32, R6
VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
VACCCQ V3, V11, V26, V27
VACQ V3, V11, V26, V19
VACCCQ V4, V12, V27, V28
VACQ V4, V12, V27, V20
VLM 0(R5), V7, V8 // 32-bytes into V7..V8
VLM 0(R6), V15, V16 // 32-bytes into V15..V16
ADD $32, R5
ADD $32, R6
VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
VACCCQ V5, V13, V28, V29
VACQ V5, V13, V28, V21
VACCCQ V6, V14, V29, V30
VACQ V6, V14, V29, V22
VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
VACCCQ V7, V15, V30, V31
VACQ V7, V15, V30, V23
VACCCQ V8, V16, V31, V0 // V0 has carry-over
VACQ V8, V16, V31, V24
VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
VSTM V17, V24, 0(R7) // 128-bytes into z
ADD $128, R7
ADD $128, R10 // i += 16
SUB $16, R3 // n -= 16
BGE UU1 // if n >= 0 goto UU1
VLGVG $1, V0, R4 // put cf into R4
NEG R4, R4 // save cf
A1:
ADD $12, R3 // n += 16
// s/BLT/JMP/ below to disable the unrolled loop
BLT v1 // if n < 0 goto v1
U1: // n >= 0
// regular loop body unrolled 4x
MOVD 0(R8)(R10*1), R5
MOVD 8(R8)(R10*1), R6
MOVD 16(R8)(R10*1), R7
MOVD 24(R8)(R10*1), R1
ADDC R4, R4 // restore CF
MOVD 0(R9)(R10*1), R11
ADDE R11, R5
MOVD 8(R9)(R10*1), R11
ADDE R11, R6
MOVD 16(R9)(R10*1), R11
ADDE R11, R7
MOVD 24(R9)(R10*1), R11
ADDE R11, R1
MOVD R0, R4
ADDE R4, R4 // save CF
NEG R4, R4
MOVD R5, 0(R2)(R10*1)
MOVD R6, 8(R2)(R10*1)
MOVD R7, 16(R2)(R10*1)
MOVD R1, 24(R2)(R10*1)
ADD $32, R10 // i += 4
SUB $4, R3 // n -= 4
BGE U1 // if n >= 0 goto U1
v1:
ADD $4, R3 // n += 4
BLE E1 // if n <= 0 goto E1
L1: // n > 0
ADDC R4, R4 // restore CF
MOVD 0(R8)(R10*1), R5
MOVD 0(R9)(R10*1), R11
ADDE R11, R5
MOVD R5, 0(R2)(R10*1)
MOVD R0, R4
ADDE R4, R4 // save CF
NEG R4, R4
ADD $8, R10 // i++
SUB $1, R3 // n--
BGT L1 // if n > 0 goto L1
E1:
NEG R4, R4 // undo the save encoding: carry back to 0 or 1
MOVD R4, c+72(FP) // return c
RET
// func subVVvec(z, x, y []Word) (c Word)
//
// Vector-facility subtract-with-borrow: words are processed 16 at a
// time with 128-bit vector subtracts (VSBIQ plus VSBCBIQ for the
// borrow-out), then 4 at a time with SUBC/SUBE, then one at a time.
// On s390x the subtraction carry flag is 1 for "no borrow", hence the
// seed of 1 below and the SUB/NEG adjustments between sections.
TEXT ·subVVvec(SB), NOSPLIT, $0
MOVD z_len+8(FP), R3
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD z+0(FP), R2
MOVD $0, R4 // c = 0
MOVD $0, R0 // make sure it's zero
MOVD $0, R10 // i = 0
// s/BLT/JMP/ below to disable the unrolled loops
SUB $4, R3 // n -= 4
BLT v1 // if n < 0 goto v1
SUB $12, R3 // n -= 16
BLT A1 // if n < 0 goto A1
MOVD R8, R5
MOVD R9, R6
MOVD R2, R7
// n >= 0
// regular loop body unrolled 16x
VZERO V0 // cf = 0
MOVD $1, R4 // for 390 subtraction cf starts as 1 (no borrow)
VLVGG $1, R4, V0 // put carry into V0
UU1:
VLM 0(R5), V1, V4 // 64-bytes into V1..V4
ADD $64, R5
VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
VLM 0(R6), V9, V12 // 64-bytes into V9..V12
ADD $64, R6
VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order
VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
VSBCBIQ V1, V9, V0, V25 // next borrow
VSBIQ V1, V9, V0, V17 // 128-bit difference with borrow-in
VSBCBIQ V2, V10, V25, V26
VSBIQ V2, V10, V25, V18
VLM 0(R5), V5, V6 // 32-bytes into V5..V6
VLM 0(R6), V13, V14 // 32-bytes into V13..V14
ADD $32, R5
ADD $32, R6
VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
VSBCBIQ V3, V11, V26, V27
VSBIQ V3, V11, V26, V19
VSBCBIQ V4, V12, V27, V28
VSBIQ V4, V12, V27, V20
VLM 0(R5), V7, V8 // 32-bytes into V7..V8
VLM 0(R6), V15, V16 // 32-bytes into V15..V16
ADD $32, R5
ADD $32, R6
VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
VSBCBIQ V5, V13, V28, V29
VSBIQ V5, V13, V28, V21
VSBCBIQ V6, V14, V29, V30
VSBIQ V6, V14, V29, V22
VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
VSBCBIQ V7, V15, V30, V31
VSBIQ V7, V15, V30, V23
VSBCBIQ V8, V16, V31, V0 // V0 has carry-over
VSBIQ V8, V16, V31, V24
VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
VSTM V17, V24, 0(R7) // 128-bytes into z
ADD $128, R7
ADD $128, R10 // i += 16
SUB $16, R3 // n -= 16
BGE UU1 // if n >= 0 goto UU1
VLGVG $1, V0, R4 // put cf into R4
SUB $1, R4 // save cf
A1:
ADD $12, R3 // n += 16
BLT v1 // if n < 0 goto v1
U1: // n >= 0
// regular loop body unrolled 4x
MOVD 0(R8)(R10*1), R5
MOVD 8(R8)(R10*1), R6
MOVD 16(R8)(R10*1), R7
MOVD 24(R8)(R10*1), R1
MOVD R0, R11 // R11 is scratch: 0 - cf just sets the borrow flag
SUBC R4, R11 // restore CF
MOVD 0(R9)(R10*1), R11
SUBE R11, R5
MOVD 8(R9)(R10*1), R11
SUBE R11, R6
MOVD 16(R9)(R10*1), R11
SUBE R11, R7
MOVD 24(R9)(R10*1), R11
SUBE R11, R1
MOVD R0, R4
SUBE R4, R4 // save CF
MOVD R5, 0(R2)(R10*1)
MOVD R6, 8(R2)(R10*1)
MOVD R7, 16(R2)(R10*1)
MOVD R1, 24(R2)(R10*1)
ADD $32, R10 // i += 4
SUB $4, R3 // n -= 4
BGE U1 // if n >= 0 goto U1
v1:
ADD $4, R3 // n += 4
BLE E1 // if n <= 0 goto E1
L1: // n > 0
MOVD R0, R11 // R11 is scratch: 0 - cf just sets the borrow flag
SUBC R4, R11 // restore CF
MOVD 0(R8)(R10*1), R5
MOVD 0(R9)(R10*1), R11
SUBE R11, R5
MOVD R5, 0(R2)(R10*1)
MOVD R0, R4
SUBE R4, R4 // save CF
ADD $8, R10 // i++
SUB $1, R3 // n--
BGT L1 // if n > 0 goto L1
E1:
NEG R4, R4 // convert saved flag to final borrow (0 or 1)
MOVD R4, c+72(FP) // return c
RET

View File

@ -15,7 +15,6 @@ import (
var generateFlag = flag.Bool("generate", false, "generate files")
func Test(t *testing.T) {
t.Skip("assembly not yet installed")
for _, arch := range arches {
t.Run(arch.Name, func(t *testing.T) {
file, data := generate(arch)