math/big: replace assembly with mini-compiler output

Step 4 of the mini-compiler: switch to the new generated assembly.
No systematic performance regressions, and many many improvements.
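
For orientation, the benchmarks exercise math/big's word-vector kernels
(addVV, subVV, lshVU, rshVU, mulAddVWW, addMulVVWW). A minimal pure-Go
sketch of the simplest one, addVV, shows what the generated assembly
computes; this is an illustration only, not the code being replaced:

	package sketch

	import "math/bits"

	type Word uintptr // as in math/big

	// addVV sets z = x + y (word vectors of equal length) and
	// returns the carry out of the top word.
	func addVV(z, x, y []Word) (c Word) {
		for i := range z {
			s, carry := bits.Add(uint(x[i]), uint(y[i]), uint(c))
			z[i], c = Word(s), Word(carry)
		}
		return c
	}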

In the benchmarks, the systems are:

	c3h88     GOARCH=amd64     c3h88 perf gomote (newer Intel, Google Cloud)
	c2s16     GOARCH=amd64     c2s16 perf gomote (Intel, Google Cloud)
	s7        GOARCH=amd64     rsc basement server (AMD Ryzen 9 7950X)
	386       GOARCH=386       gotip-linux-386 gomote (Intel, Google Cloud)
	s7-386    GOARCH=386       rsc basement server (AMD Ryzen 9 7950X)
	c4as16    GOARCH=arm64     c4as16 perf gomote (Google Cloud)
	mac       GOARCH=arm64     Apple M3 Pro in MacBook Pro
	arm       GOARCH=arm       gotip-linux-arm gomote
	loong64   GOARCH=loong64   gotip-linux-loong64 gomote
	ppc64le   GOARCH=ppc64le   gotip-linux-ppc64le gomote
	riscv64   GOARCH=riscv64   gotip-linux-riscv64 gomote
	s390x     GOARCH=s390x     linux-s390x-ibm old gomote

benchmark \ system           c3h88    c2s16       s7      386   s7-386   c4as16      mac      arm  loong64  ppc64le  riscv64    s390x
AddVV/words=1               -4.03%   +5.21%   -4.04%   +4.94%        ~        ~        ~        ~  -19.51%        ~        ~        ~
AddVV/words=10             -10.20%   +0.34%   -3.46%  -11.50%   -7.46%   +7.66%   +5.97%        ~  -17.90%        ~        ~        ~
AddVV/words=16             -10.91%   -6.45%   -8.45%  -21.86%  -17.90%   +2.73%   -1.61%        ~  -22.47%   -3.54%        ~        ~
AddVV/words=100             -3.77%   -4.30%   -3.17%  -47.27%  -45.34%   -0.78%        ~   -8.74%  -27.19%        ~        ~        ~
AddVV/words=1000            -0.08%   -0.71%        ~  -49.21%  -48.07%        ~        ~  -16.80%  -24.74%        ~        ~        ~
AddVV/words=10000                ~        ~        ~  -48.73%  -48.56%   -0.06%        ~  -17.08%        ~        ~   -4.81%        ~
AddVV/words=100000               ~        ~        ~  -47.80%  -48.38%        ~        ~  -15.10%  -25.06%        ~   -5.34%        ~
SubVV/words=1               -0.84%   +3.43%   -3.62%   +1.34%        ~   -0.76%        ~        ~  -18.18%   +5.58%        ~        ~
SubVV/words=10              -9.99%   +0.34%        ~  -11.23%   -8.24%   +7.53%   +6.15%        ~  -17.55%   +2.77%   -2.08%        ~
SubVV/words=16             -11.94%   -6.45%   -6.81%  -21.82%  -18.11%   +1.58%   -1.21%        ~  -20.36%        ~        ~        ~
SubVV/words=100             -3.38%   -4.32%   -1.80%  -46.14%  -46.43%   +0.41%        ~   -7.20%  -26.17%        ~   -0.42%        ~
SubVV/words=1000            -0.38%   -0.80%        ~  -49.22%  -48.90%        ~        ~  -15.86%  -24.73%        ~        ~        ~
SubVV/words=10000                ~        ~        ~  -49.57%  -49.64%   -0.03%        ~  -15.85%  -26.52%        ~   -5.05%        ~
SubVV/words=100000               ~        ~        ~  -46.88%  -49.66%        ~        ~  -15.45%  -16.11%        ~   -4.99%        ~
LshVU/words=1                    ~   +5.78%        ~        ~   -2.48%   +1.61%   +2.18%   +2.70%  -18.16%  -34.16%  -21.29%        ~
LshVU/words=10             -18.34%   -3.78%   +2.21%        ~        ~   -2.81%  -12.54%        ~  -25.02%  -24.78%  -38.11%  -66.98%
LshVU/words=16             -23.15%   +1.03%   +7.74%   +0.73%        ~   +8.88%   +1.56%        ~  -25.37%  -28.46%  -41.27%        ~
LshVU/words=100            -32.85%   -8.86%   -2.58%        ~   +2.69%   +1.24%        ~  -20.63%  -44.14%  -42.68%  -53.09%        ~
LshVU/words=1000           -37.30%   -0.20%   +5.67%        ~        ~   +1.44%        ~  -27.83%  -45.01%  -37.07%  -57.02%  -46.57%
LshVU/words=10000          -36.84%   -2.30%   +3.82%        ~   +1.86%   +1.57%  -66.81%  -28.00%  -13.15%  -35.40%  -41.97%        ~
LshVU/words=100000         -40.30%        ~   +3.96%        ~        ~        ~        ~  -24.91%  -19.06%  -36.14%  -40.99%  -66.03%
RshVU/words=1               -3.17%   +4.76%   -4.06%   +4.31%   +4.55%        ~        ~        ~  -20.61%        ~  -26.20%  -51.33%
RshVU/words=10             -22.08%   -4.41%  -17.99%   +3.64%  -11.87%        ~  -16.30%        ~  -30.01%        ~  -40.37%  -63.05%
RshVU/words=16             -26.03%   -8.50%  -18.09%        ~  -17.52%   +6.50%        ~   -2.85%  -30.24%        ~  -42.93%  -63.13%
RshVU/words=100            -20.87%  -28.83%  -29.45%        ~  -26.25%   +1.46%   -1.14%  -16.20%  -45.65%  -16.20%  -53.66%  -77.27%
RshVU/words=1000           -24.03%  -21.37%  -26.71%        ~  -28.95%   +0.98%        ~  -18.82%  -45.21%  -23.55%  -57.09%  -71.18%
RshVU/words=10000          -24.56%  -22.44%  -27.01%        ~  -28.88%   +0.78%   -5.35%  -17.47%  -16.87%  -20.67%  -41.97%        ~
RshVU/words=100000         -23.36%  -15.65%  -27.54%        ~  -29.26%   +1.73%   -6.67%  -13.68%  -21.40%  -23.02%  -40.37%  -66.31%
MulAddVWW/words=1           +2.37%   +8.14%        ~   +4.10%   +3.71%        ~        ~        ~  -21.62%        ~   +1.12%        ~
MulAddVWW/words=10               ~   -2.72%  -15.15%   +8.04%        ~        ~        ~   -2.52%  -19.48%        ~   -6.18%        ~
MulAddVWW/words=16               ~   +1.49%        ~   +4.49%   +6.58%   -8.70%   -7.16%  -12.08%  -21.43%   -6.59%   -9.05%        ~
MulAddVWW/words=100         +0.37%   +1.11%   -4.51%  -13.59%        ~  -11.10%   -3.63%  -21.40%  -22.27%   -2.92%  -14.41%        ~
MulAddVWW/words=1000             ~   +0.90%   -7.13%  -18.94%        ~  -14.02%   -9.97%  -28.31%  -18.72%   -2.32%  -15.80%        ~
MulAddVWW/words=10000            ~   +1.08%   -6.75%  -19.10%        ~  -14.61%   -9.04%  -28.48%  -14.29%   -2.25%   -9.40%        ~
MulAddVWW/words=100000           ~        ~   -6.93%  -18.09%        ~  -14.33%   -9.66%  -28.92%  -16.63%   -2.43%   -8.23%        ~
AddMulVVWW/words=1          +2.30%   +4.83%  -11.37%   +4.58%        ~   -3.14%        ~        ~  -10.58%  +30.35%        ~        ~
AddMulVVWW/words=10         -3.27%        ~   +8.96%   +5.74%        ~   +2.67%   -1.44%   -7.64%  -13.41%        ~        ~        ~
AddMulVVWW/words=16         -6.12%        ~        ~        ~   +1.91%   -7.90%  -16.22%  -14.07%  -14.26%   -4.15%   -7.30%        ~
AddMulVVWW/words=100        -5.48%   -2.14%        ~   -9.40%   +9.98%   -1.43%  -12.35%  -18.56%  -21.94%        ~   -9.84%        ~
AddMulVVWW/words=1000      -11.35%   -3.40%   -3.64%  -11.04%  +12.82%   -1.33%  -15.63%  -20.50%  -20.95%        ~  -11.06%  -51.97%
AddMulVVWW/words=10000     -10.31%   -1.61%   -8.41%  -12.15%  +13.10%   -1.03%  -16.34%  -22.46%   -1.00%        ~  -10.33%  -49.80%
AddMulVVWW/words=100000    -13.71%        ~   -8.31%  -12.18%  +12.98%   -1.35%  -15.20%  -21.89%        ~        ~   -9.38%  -48.30%

Change-Id: I0a33c33602c0d053c84d9946e662500cfa048e2d
Reviewed-on: https://go-review.googlesource.com/c/go/+/664938
Reviewed-by: Alan Donovan <adonovan@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Commit: 7f516a31b0 (parent 39070da4f8)
Author: Russ Cox, 2025-04-10 17:01:24 -04:00; committed by Gopher Robot
17 changed files with 4200 additions and 2523 deletions


@@ -1,192 +1,240 @@
// Copyright 2009 The Go Authors. All rights reserved. // Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go //go:build !math_big_pure_go
#include "textflag.h" #include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// func addVV(z, x, y []Word) (c Word) // func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB), NOSPLIT, $0 TEXT ·addVV(SB), NOSPLIT, $0
MOVL z+0(FP), DI MOVL z_len+4(FP), BX
MOVL x+12(FP), SI MOVL x_base+12(FP), SI
MOVL y+24(FP), CX MOVL y_base+24(FP), DI
MOVL z_len+4(FP), BP MOVL z_base+0(FP), BP
MOVL $0, BX // i = 0 // compute unrolled loop lengths
MOVL $0, DX // c = 0 MOVL BX, CX
JMP E1 ANDL $3, CX
SHRL $2, BX
L1: MOVL (SI)(BX*4), AX MOVL $0, DX // clear saved carry
ADDL DX, DX // restore CF loop1:
ADCL (CX)(BX*4), AX TESTL CX, CX; JZ loop1done
SBBL DX, DX // save CF loop1cont:
MOVL AX, (DI)(BX*4) // unroll 1X in batches of 1
ADDL $1, BX // i++ ADDL DX, DX // restore carry
MOVL 0(SI), DX
E1: CMPL BX, BP // i < n ADCL 0(DI), DX
JL L1 MOVL DX, 0(BP)
SBBL DX, DX // save carry
NEGL DX LEAL 4(SI), SI // ADD $4, SI
LEAL 4(DI), DI // ADD $4, DI
LEAL 4(BP), BP // ADD $4, BP
SUBL $1, CX; JNZ loop1cont
loop1done:
loop4:
TESTL BX, BX; JZ loop4done
loop4cont:
// unroll 4X in batches of 1
ADDL DX, DX // restore carry
MOVL 0(SI), CX
ADCL 0(DI), CX
MOVL CX, 0(BP)
MOVL 4(SI), CX
ADCL 4(DI), CX
MOVL CX, 4(BP)
MOVL 8(SI), CX
ADCL 8(DI), CX
MOVL CX, 8(BP)
MOVL 12(SI), CX
ADCL 12(DI), CX
MOVL CX, 12(BP)
SBBL DX, DX // save carry
LEAL 16(SI), SI // ADD $16, SI
LEAL 16(DI), DI // ADD $16, DI
LEAL 16(BP), BP // ADD $16, BP
SUBL $1, BX; JNZ loop4cont
loop4done:
NEGL DX // convert add carry
MOVL DX, c+36(FP) MOVL DX, c+36(FP)
RET RET
// func subVV(z, x, y []Word) (c Word) // func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBL instead of ADCL and label names)
TEXT ·subVV(SB), NOSPLIT, $0 TEXT ·subVV(SB), NOSPLIT, $0
MOVL z+0(FP), DI MOVL z_len+4(FP), BX
MOVL x+12(FP), SI MOVL x_base+12(FP), SI
MOVL y+24(FP), CX MOVL y_base+24(FP), DI
MOVL z_len+4(FP), BP MOVL z_base+0(FP), BP
MOVL $0, BX // i = 0 // compute unrolled loop lengths
MOVL $0, DX // c = 0 MOVL BX, CX
JMP E2 ANDL $3, CX
SHRL $2, BX
L2: MOVL (SI)(BX*4), AX MOVL $0, DX // clear saved carry
ADDL DX, DX // restore CF loop1:
SBBL (CX)(BX*4), AX TESTL CX, CX; JZ loop1done
SBBL DX, DX // save CF loop1cont:
MOVL AX, (DI)(BX*4) // unroll 1X in batches of 1
ADDL $1, BX // i++ ADDL DX, DX // restore carry
MOVL 0(SI), DX
E2: CMPL BX, BP // i < n SBBL 0(DI), DX
JL L2 MOVL DX, 0(BP)
SBBL DX, DX // save carry
NEGL DX LEAL 4(SI), SI // ADD $4, SI
LEAL 4(DI), DI // ADD $4, DI
LEAL 4(BP), BP // ADD $4, BP
SUBL $1, CX; JNZ loop1cont
loop1done:
loop4:
TESTL BX, BX; JZ loop4done
loop4cont:
// unroll 4X in batches of 1
ADDL DX, DX // restore carry
MOVL 0(SI), CX
SBBL 0(DI), CX
MOVL CX, 0(BP)
MOVL 4(SI), CX
SBBL 4(DI), CX
MOVL CX, 4(BP)
MOVL 8(SI), CX
SBBL 8(DI), CX
MOVL CX, 8(BP)
MOVL 12(SI), CX
SBBL 12(DI), CX
MOVL CX, 12(BP)
SBBL DX, DX // save carry
LEAL 16(SI), SI // ADD $16, SI
LEAL 16(DI), DI // ADD $16, DI
LEAL 16(BP), BP // ADD $16, BP
SUBL $1, BX; JNZ loop4cont
loop4done:
NEGL DX // convert sub carry
MOVL DX, c+36(FP) MOVL DX, c+36(FP)
RET RET
// func lshVU(z, x []Word, s uint) (c Word) // func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0 TEXT ·lshVU(SB), NOSPLIT, $0
MOVL z_len+4(FP), BX // i = z MOVL z_len+4(FP), BX
SUBL $1, BX // i-- TESTL BX, BX; JZ ret0
JL X8b // i < 0 (n <= 0)
// n > 0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL s+24(FP), CX MOVL s+24(FP), CX
MOVL (SI)(BX*4), AX // w1 = x[n-1] MOVL x_base+12(FP), SI
MOVL z_base+0(FP), DI
// run loop backward, using counter as positive index
// shift first word into carry
MOVL -4(SI)(BX*4), BP
MOVL $0, DX MOVL $0, DX
SHLL CX, AX, DX // w1>>ŝ SHLL CX, BP, DX
MOVL DX, c+28(FP) MOVL DX, c+28(FP)
// shift remaining words
CMPL BX, $0 SUBL $1, BX
JLE X8a // i <= 0 loop1:
TESTL BX, BX; JZ loop1done
// i > 0 loop1cont:
L8: MOVL AX, DX // w = w1 // unroll 1X in batches of 1
MOVL -4(SI)(BX*4), AX // w1 = x[i-1] MOVL -4(SI)(BX*4), DX
SHLL CX, AX, DX // w<<s | w1>>ŝ SHLL CX, DX, BP
MOVL DX, (DI)(BX*4) // z[i] = w<<s | w1>>ŝ MOVL BP, 0(DI)(BX*4)
SUBL $1, BX // i-- MOVL DX, BP
JG L8 // i > 0 SUBL $1, BX; JNZ loop1cont
loop1done:
// i <= 0 // store final shifted bits
X8a: SHLL CX, AX // w1<<s SHLL CX, BP
MOVL AX, (DI) // z[0] = w1<<s MOVL BP, 0(DI)(BX*4)
RET RET
ret0:
X8b: MOVL $0, c+28(FP) MOVL $0, c+28(FP)
RET RET
// func rshVU(z, x []Word, s uint) (c Word) // func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB), NOSPLIT, $0 TEXT ·rshVU(SB), NOSPLIT, $0
MOVL z_len+4(FP), BP MOVL z_len+4(FP), BX
SUBL $1, BP // n-- TESTL BX, BX; JZ ret0
JL X9b // n < 0 (n <= 0)
// n > 0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL s+24(FP), CX MOVL s+24(FP), CX
MOVL (SI), AX // w1 = x[0] MOVL x_base+12(FP), SI
MOVL z_base+0(FP), DI
// use counter as negative index
LEAL (SI)(BX*4), SI
LEAL (DI)(BX*4), DI
NEGL BX
// shift first word into carry
MOVL 0(SI)(BX*4), BP
MOVL $0, DX MOVL $0, DX
SHRL CX, AX, DX // w1<<ŝ SHRL CX, BP, DX
MOVL DX, c+28(FP) MOVL DX, c+28(FP)
// shift remaining words
MOVL $0, BX // i = 0 ADDL $1, BX
JMP E9 loop1:
TESTL BX, BX; JZ loop1done
// i < n-1 loop1cont:
L9: MOVL AX, DX // w = w1 // unroll 1X in batches of 1
MOVL 4(SI)(BX*4), AX // w1 = x[i+1] MOVL 0(SI)(BX*4), DX
SHRL CX, AX, DX // w>>s | w1<<ŝ SHRL CX, DX, BP
MOVL DX, (DI)(BX*4) // z[i] = w>>s | w1<<ŝ MOVL BP, -4(DI)(BX*4)
ADDL $1, BX // i++ MOVL DX, BP
ADDL $1, BX; JNZ loop1cont
E9: CMPL BX, BP loop1done:
JL L9 // i < n-1 // store final shifted bits
SHRL CX, BP
// i >= n-1 MOVL BP, -4(DI)(BX*4)
X9a: SHRL CX, AX // w1>>s
MOVL AX, (DI)(BP*4) // z[n-1] = w1>>s
RET RET
ret0:
X9b: MOVL $0, c+28(FP) MOVL $0, c+28(FP)
RET RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word) // func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0 TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVL z+0(FP), DI MOVL m+24(FP), BX
MOVL x+12(FP), SI MOVL a+28(FP), SI
MOVL m+24(FP), BP MOVL z_len+4(FP), DI
MOVL a+28(FP), CX // c = a MOVL x_base+12(FP), BP
MOVL z_len+4(FP), BX MOVL z_base+0(FP), CX
LEAL (DI)(BX*4), DI // use counter as negative index
LEAL (SI)(BX*4), SI LEAL (BP)(DI*4), BP
NEGL BX // i = -n LEAL (CX)(DI*4), CX
JMP E5 NEGL DI
loop1:
L5: MOVL (SI)(BX*4), AX TESTL DI, DI; JZ loop1done
MULL BP loop1cont:
ADDL CX, AX // unroll 1X in batches of 1
ADCL $0, DX MOVL 0(BP)(DI*4), AX
MOVL AX, (DI)(BX*4) // multiply
MOVL DX, CX MULL BX
ADDL $1, BX // i++ ADDL SI, AX
MOVL DX, SI
E5: CMPL BX, $0 // i < 0 ADCL $0, SI
JL L5 MOVL AX, 0(CX)(DI*4)
ADDL $1, DI; JNZ loop1cont
MOVL CX, c+32(FP) loop1done:
MOVL SI, c+32(FP)
RET RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word) // func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0 TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVL z+0(FP), BP MOVL a+40(FP), BX
MOVL x+12(FP), DI MOVL z_len+4(FP), SI
MOVL y+24(FP), SI MOVL x_base+12(FP), DI
MOVL a+40(FP), CX MOVL y_base+24(FP), BP
MOVL z_len+4(FP), BX MOVL z_base+0(FP), CX
LEAL (DI)(BX*4), DI // use counter as negative index
LEAL (SI)(BX*4), SI LEAL (DI)(SI*4), DI
LEAL (BP)(BX*4), BP LEAL (BP)(SI*4), BP
NEGL BX // i = -n LEAL (CX)(SI*4), CX
JMP E6 NEGL SI
loop1:
L6: MOVL (SI)(BX*4), AX TESTL SI, SI; JZ loop1done
loop1cont:
// unroll 1X in batches of 1
MOVL 0(BP)(SI*4), AX
// multiply
MULL m+36(FP) MULL m+36(FP)
ADDL CX, AX ADDL BX, AX
ADCL $0, DX MOVL DX, BX
ADDL (DI)(BX*4), AX ADCL $0, BX
MOVL AX, (BP)(BX*4) // add
ADCL $0, DX ADDL 0(DI)(SI*4), AX
MOVL DX, CX ADCL $0, BX
ADDL $1, BX // i++ MOVL AX, 0(CX)(SI*4)
ADDL $1, SI; JNZ loop1cont
E6: CMPL BX, $0 // i < 0 loop1done:
JL L6 MOVL BX, c+44(FP)
MOVL CX, c+44(FP)
RET RET
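
All of the generated functions above share one skeleton: the word count is
split into a scalar remainder loop (loop1) and an unrolled main loop
(loop4 or loop8). In Go terms, the "compute unrolled loop lengths" step is
roughly this sketch:

	// splitUnroll4 is illustrative; the generated code computes these
	// two counts with an AND $3 and a shift right by 2.
	func splitUnroll4(n int) (scalar, groups int) {
		return n & 3, n >> 2 // loop1 runs scalar times, loop4 runs groups times
	}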


@@ -8,4 +8,4 @@ package big

import "internal/cpu"

-var support_adx = cpu.X86.HasADX && cpu.X86.HasBMI2
+var hasADX = cpu.X86.HasADX && cpu.X86.HasBMI2


@@ -1,408 +1,462 @@
// Copyright 2009 The Go Authors. All rights reserved. // Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go //go:build !math_big_pure_go
#include "textflag.h" #include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
// This is faster than using rotate instructions.
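
A short Go illustration of that save/restore trick, using math/bits and
assuming the usual x86 flag semantics (a sketch only):

	// SBBQ Rx, Rx leaves Rx = 0 - 0 - CF, that is, 0 or -1.
	func saveCarry(cf uint64) uint64 { return -cf }

	// ADDQ Rx, Rx sets CF again: -1 + -1 carries out, 0 + 0 does not.
	func restoreCarry(saved uint64) (cf uint64) {
		_, cf = bits.Add64(saved, saved, 0)
		return cf
	}
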
// func addVV(z, x, y []Word) (c Word) // func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB), NOSPLIT, $0 TEXT ·addVV(SB), NOSPLIT, $0
MOVQ z_len+8(FP), DI MOVQ z_len+8(FP), BX
MOVQ x+24(FP), R8 MOVQ x_base+24(FP), SI
MOVQ y+48(FP), R9 MOVQ y_base+48(FP), DI
MOVQ z+0(FP), R10 MOVQ z_base+0(FP), R8
// compute unrolled loop lengths
MOVQ $0, CX // c = 0 MOVQ BX, R9
MOVQ $0, SI // i = 0 ANDQ $3, R9
SHRQ $2, BX
// s/JL/JMP/ below to disable the unrolled loop MOVQ $0, R10 // clear saved carry
SUBQ $4, DI // n -= 4 loop1:
JL V1 // if n < 0 goto V1 TESTQ R9, R9; JZ loop1done
loop1cont:
U1: // n >= 0 // unroll 1X
// regular loop body unrolled 4x ADDQ R10, R10 // restore carry
ADDQ CX, CX // restore CF MOVQ 0(SI), R10
MOVQ 0(R8)(SI*8), R11 ADCQ 0(DI), R10
MOVQ 8(R8)(SI*8), R12 MOVQ R10, 0(R8)
MOVQ 16(R8)(SI*8), R13 SBBQ R10, R10 // save carry
MOVQ 24(R8)(SI*8), R14 LEAQ 8(SI), SI // ADD $8, SI
ADCQ 0(R9)(SI*8), R11 LEAQ 8(DI), DI // ADD $8, DI
ADCQ 8(R9)(SI*8), R12 LEAQ 8(R8), R8 // ADD $8, R8
ADCQ 16(R9)(SI*8), R13 SUBQ $1, R9; JNZ loop1cont
ADCQ 24(R9)(SI*8), R14 loop1done:
MOVQ R11, 0(R10)(SI*8) loop4:
MOVQ R12, 8(R10)(SI*8) TESTQ BX, BX; JZ loop4done
MOVQ R13, 16(R10)(SI*8) loop4cont:
MOVQ R14, 24(R10)(SI*8) // unroll 4X
SBBQ CX, CX // save CF ADDQ R10, R10 // restore carry
MOVQ 0(SI), R9
ADDQ $4, SI // i += 4 MOVQ 8(SI), R10
SUBQ $4, DI // n -= 4 MOVQ 16(SI), R11
JGE U1 // if n >= 0 goto U1 MOVQ 24(SI), R12
ADCQ 0(DI), R9
V1: ADDQ $4, DI // n += 4 ADCQ 8(DI), R10
JLE E1 // if n <= 0 goto E1 ADCQ 16(DI), R11
ADCQ 24(DI), R12
L1: // n > 0 MOVQ R9, 0(R8)
ADDQ CX, CX // restore CF MOVQ R10, 8(R8)
MOVQ 0(R8)(SI*8), R11 MOVQ R11, 16(R8)
ADCQ 0(R9)(SI*8), R11 MOVQ R12, 24(R8)
MOVQ R11, 0(R10)(SI*8) SBBQ R10, R10 // save carry
SBBQ CX, CX // save CF LEAQ 32(SI), SI // ADD $32, SI
LEAQ 32(DI), DI // ADD $32, DI
ADDQ $1, SI // i++ LEAQ 32(R8), R8 // ADD $32, R8
SUBQ $1, DI // n-- SUBQ $1, BX; JNZ loop4cont
JG L1 // if n > 0 goto L1 loop4done:
NEGQ R10 // convert add carry
E1: NEGQ CX MOVQ R10, c+72(FP)
MOVQ CX, c+72(FP) // return c
RET RET
// func subVV(z, x, y []Word) (c Word) // func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBQ instead of ADCQ and label names)
TEXT ·subVV(SB), NOSPLIT, $0 TEXT ·subVV(SB), NOSPLIT, $0
MOVQ z_len+8(FP), DI MOVQ z_len+8(FP), BX
MOVQ x+24(FP), R8 MOVQ x_base+24(FP), SI
MOVQ y+48(FP), R9 MOVQ y_base+48(FP), DI
MOVQ z+0(FP), R10 MOVQ z_base+0(FP), R8
// compute unrolled loop lengths
MOVQ $0, CX // c = 0 MOVQ BX, R9
MOVQ $0, SI // i = 0 ANDQ $3, R9
SHRQ $2, BX
// s/JL/JMP/ below to disable the unrolled loop MOVQ $0, R10 // clear saved carry
SUBQ $4, DI // n -= 4 loop1:
JL V2 // if n < 0 goto V2 TESTQ R9, R9; JZ loop1done
loop1cont:
U2: // n >= 0 // unroll 1X
// regular loop body unrolled 4x ADDQ R10, R10 // restore carry
ADDQ CX, CX // restore CF MOVQ 0(SI), R10
MOVQ 0(R8)(SI*8), R11 SBBQ 0(DI), R10
MOVQ 8(R8)(SI*8), R12 MOVQ R10, 0(R8)
MOVQ 16(R8)(SI*8), R13 SBBQ R10, R10 // save carry
MOVQ 24(R8)(SI*8), R14 LEAQ 8(SI), SI // ADD $8, SI
SBBQ 0(R9)(SI*8), R11 LEAQ 8(DI), DI // ADD $8, DI
SBBQ 8(R9)(SI*8), R12 LEAQ 8(R8), R8 // ADD $8, R8
SBBQ 16(R9)(SI*8), R13 SUBQ $1, R9; JNZ loop1cont
SBBQ 24(R9)(SI*8), R14 loop1done:
MOVQ R11, 0(R10)(SI*8) loop4:
MOVQ R12, 8(R10)(SI*8) TESTQ BX, BX; JZ loop4done
MOVQ R13, 16(R10)(SI*8) loop4cont:
MOVQ R14, 24(R10)(SI*8) // unroll 4X
SBBQ CX, CX // save CF ADDQ R10, R10 // restore carry
MOVQ 0(SI), R9
ADDQ $4, SI // i += 4 MOVQ 8(SI), R10
SUBQ $4, DI // n -= 4 MOVQ 16(SI), R11
JGE U2 // if n >= 0 goto U2 MOVQ 24(SI), R12
SBBQ 0(DI), R9
V2: ADDQ $4, DI // n += 4 SBBQ 8(DI), R10
JLE E2 // if n <= 0 goto E2 SBBQ 16(DI), R11
SBBQ 24(DI), R12
L2: // n > 0 MOVQ R9, 0(R8)
ADDQ CX, CX // restore CF MOVQ R10, 8(R8)
MOVQ 0(R8)(SI*8), R11 MOVQ R11, 16(R8)
SBBQ 0(R9)(SI*8), R11 MOVQ R12, 24(R8)
MOVQ R11, 0(R10)(SI*8) SBBQ R10, R10 // save carry
SBBQ CX, CX // save CF LEAQ 32(SI), SI // ADD $32, SI
LEAQ 32(DI), DI // ADD $32, DI
ADDQ $1, SI // i++ LEAQ 32(R8), R8 // ADD $32, R8
SUBQ $1, DI // n-- SUBQ $1, BX; JNZ loop4cont
JG L2 // if n > 0 goto L2 loop4done:
NEGQ R10 // convert sub carry
E2: NEGQ CX MOVQ R10, c+72(FP)
MOVQ CX, c+72(FP) // return c
RET RET
// func lshVU(z, x []Word, s uint) (c Word) // func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0 TEXT ·lshVU(SB), NOSPLIT, $0
MOVQ z_len+8(FP), BX // i = z MOVQ z_len+8(FP), BX
SUBQ $1, BX // i-- TESTQ BX, BX; JZ ret0
JL X8b // i < 0 (n <= 0)
// n > 0
MOVQ z+0(FP), R10
MOVQ x+24(FP), R8
MOVQ s+48(FP), CX MOVQ s+48(FP), CX
MOVQ (R8)(BX*8), AX // w1 = x[n-1] MOVQ x_base+24(FP), SI
MOVQ $0, DX MOVQ z_base+0(FP), DI
SHLQ CX, AX, DX // w1>>ŝ // run loop backward
MOVQ DX, c+56(FP) LEAQ (SI)(BX*8), SI
LEAQ (DI)(BX*8), DI
CMPQ BX, $0 // shift first word into carry
JLE X8a // i <= 0 MOVQ -8(SI), R8
MOVQ $0, R9
// i > 0 SHLQ CX, R8, R9
L8: MOVQ AX, DX // w = w1 MOVQ R9, c+56(FP)
MOVQ -8(R8)(BX*8), AX // w1 = x[i-1] // shift remaining words
SHLQ CX, AX, DX // w<<s | w1>>ŝ SUBQ $1, BX
MOVQ DX, (R10)(BX*8) // z[i] = w<<s | w1>>ŝ // compute unrolled loop lengths
SUBQ $1, BX // i-- MOVQ BX, R9
JG L8 // i > 0 ANDQ $3, R9
SHRQ $2, BX
// i <= 0 loop1:
X8a: SHLQ CX, AX // w1<<s TESTQ R9, R9; JZ loop1done
MOVQ AX, (R10) // z[0] = w1<<s loop1cont:
// unroll 1X
MOVQ -16(SI), R10
SHLQ CX, R10, R8
MOVQ R8, -8(DI)
MOVQ R10, R8
LEAQ -8(SI), SI // ADD $-8, SI
LEAQ -8(DI), DI // ADD $-8, DI
SUBQ $1, R9; JNZ loop1cont
loop1done:
loop4:
TESTQ BX, BX; JZ loop4done
loop4cont:
// unroll 4X
MOVQ -16(SI), R9
MOVQ -24(SI), R10
MOVQ -32(SI), R11
MOVQ -40(SI), R12
SHLQ CX, R9, R8
SHLQ CX, R10, R9
SHLQ CX, R11, R10
SHLQ CX, R12, R11
MOVQ R8, -8(DI)
MOVQ R9, -16(DI)
MOVQ R10, -24(DI)
MOVQ R11, -32(DI)
MOVQ R12, R8
LEAQ -32(SI), SI // ADD $-32, SI
LEAQ -32(DI), DI // ADD $-32, DI
SUBQ $1, BX; JNZ loop4cont
loop4done:
// store final shifted bits
SHLQ CX, R8
MOVQ R8, -8(DI)
RET RET
ret0:
X8b: MOVQ $0, c+56(FP) MOVQ $0, c+56(FP)
RET RET
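
For reference, lshVU's contract in pure Go, continuing the sketch from the
top of this message and matching the backward loop above (top word first,
so z may alias x); assumes 1 <= s <= bits.UintSize-1, as math/big uses it:

	// lshVU sets z = x<<s and returns the bits shifted out of the top word.
	func lshVU(z, x []Word, s uint) (c Word) {
		if len(z) == 0 {
			return 0
		}
		ŝ := uint(bits.UintSize) - s
		c = x[len(z)-1] >> ŝ
		for i := len(z) - 1; i > 0; i-- {
			z[i] = x[i]<<s | x[i-1]>>ŝ
		}
		z[0] = x[0] << s
		return c
	}
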
// func rshVU(z, x []Word, s uint) (c Word) // func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB), NOSPLIT, $0 TEXT ·rshVU(SB), NOSPLIT, $0
MOVQ z_len+8(FP), R11 MOVQ z_len+8(FP), BX
SUBQ $1, R11 // n-- TESTQ BX, BX; JZ ret0
JL X9b // n < 0 (n <= 0)
// n > 0
MOVQ z+0(FP), R10
MOVQ x+24(FP), R8
MOVQ s+48(FP), CX MOVQ s+48(FP), CX
MOVQ (R8), AX // w1 = x[0] MOVQ x_base+24(FP), SI
MOVQ $0, DX MOVQ z_base+0(FP), DI
SHRQ CX, AX, DX // w1<<ŝ // shift first word into carry
MOVQ DX, c+56(FP) MOVQ 0(SI), R8
MOVQ $0, R9
MOVQ $0, BX // i = 0 SHRQ CX, R8, R9
JMP E9 MOVQ R9, c+56(FP)
// shift remaining words
// i < n-1 SUBQ $1, BX
L9: MOVQ AX, DX // w = w1 // compute unrolled loop lengths
MOVQ 8(R8)(BX*8), AX // w1 = x[i+1] MOVQ BX, R9
SHRQ CX, AX, DX // w>>s | w1<<ŝ ANDQ $3, R9
MOVQ DX, (R10)(BX*8) // z[i] = w>>s | w1<<ŝ SHRQ $2, BX
ADDQ $1, BX // i++ loop1:
TESTQ R9, R9; JZ loop1done
E9: CMPQ BX, R11 loop1cont:
JL L9 // i < n-1 // unroll 1X
MOVQ 8(SI), R10
// i >= n-1 SHRQ CX, R10, R8
X9a: SHRQ CX, AX // w1>>s MOVQ R8, 0(DI)
MOVQ AX, (R10)(R11*8) // z[n-1] = w1>>s MOVQ R10, R8
LEAQ 8(SI), SI // ADD $8, SI
LEAQ 8(DI), DI // ADD $8, DI
SUBQ $1, R9; JNZ loop1cont
loop1done:
loop4:
TESTQ BX, BX; JZ loop4done
loop4cont:
// unroll 4X
MOVQ 8(SI), R9
MOVQ 16(SI), R10
MOVQ 24(SI), R11
MOVQ 32(SI), R12
SHRQ CX, R9, R8
SHRQ CX, R10, R9
SHRQ CX, R11, R10
SHRQ CX, R12, R11
MOVQ R8, 0(DI)
MOVQ R9, 8(DI)
MOVQ R10, 16(DI)
MOVQ R11, 24(DI)
MOVQ R12, R8
LEAQ 32(SI), SI // ADD $32, SI
LEAQ 32(DI), DI // ADD $32, DI
SUBQ $1, BX; JNZ loop4cont
loop4done:
// store final shifted bits
SHRQ CX, R8
MOVQ R8, 0(DI)
RET RET
ret0:
X9b: MOVQ $0, c+56(FP) MOVQ $0, c+56(FP)
RET RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word) // func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0 TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVQ z+0(FP), R10 MOVQ m+48(FP), BX
MOVQ x+24(FP), R8 MOVQ a+56(FP), SI
MOVQ m+48(FP), R9 MOVQ z_len+8(FP), DI
MOVQ a+56(FP), CX // c = a MOVQ x_base+24(FP), R8
MOVQ z_len+8(FP), R11 MOVQ z_base+0(FP), R9
MOVQ $0, BX // i = 0 // compute unrolled loop lengths
MOVQ DI, R10
CMPQ R11, $4 ANDQ $3, R10
JL E5 SHRQ $2, DI
loop1:
U5: // i+4 <= n TESTQ R10, R10; JZ loop1done
// regular loop body unrolled 4x loop1cont:
MOVQ (0*8)(R8)(BX*8), AX // unroll 1X in batches of 1
MULQ R9 MOVQ 0(R8), AX
ADDQ CX, AX // multiply
ADCQ $0, DX MULQ BX
MOVQ AX, (0*8)(R10)(BX*8) ADDQ SI, AX
MOVQ DX, CX MOVQ DX, SI
MOVQ (1*8)(R8)(BX*8), AX ADCQ $0, SI
MULQ R9 MOVQ AX, 0(R9)
ADDQ CX, AX LEAQ 8(R8), R8 // ADD $8, R8
ADCQ $0, DX LEAQ 8(R9), R9 // ADD $8, R9
MOVQ AX, (1*8)(R10)(BX*8) SUBQ $1, R10; JNZ loop1cont
MOVQ DX, CX loop1done:
MOVQ (2*8)(R8)(BX*8), AX loop4:
MULQ R9 TESTQ DI, DI; JZ loop4done
ADDQ CX, AX loop4cont:
ADCQ $0, DX // unroll 4X in batches of 1
MOVQ AX, (2*8)(R10)(BX*8) MOVQ 0(R8), AX
MOVQ DX, CX // multiply
MOVQ (3*8)(R8)(BX*8), AX MULQ BX
MULQ R9 ADDQ SI, AX
ADDQ CX, AX MOVQ DX, SI
ADCQ $0, DX ADCQ $0, SI
MOVQ AX, (3*8)(R10)(BX*8) MOVQ AX, 0(R9)
MOVQ DX, CX MOVQ 8(R8), AX
ADDQ $4, BX // i += 4 // multiply
MULQ BX
LEAQ 4(BX), DX ADDQ SI, AX
CMPQ DX, R11 MOVQ DX, SI
JLE U5 ADCQ $0, SI
JMP E5 MOVQ AX, 8(R9)
MOVQ 16(R8), AX
L5: MOVQ (R8)(BX*8), AX // multiply
MULQ R9 MULQ BX
ADDQ CX, AX ADDQ SI, AX
ADCQ $0, DX MOVQ DX, SI
MOVQ AX, (R10)(BX*8) ADCQ $0, SI
MOVQ DX, CX MOVQ AX, 16(R9)
ADDQ $1, BX // i++ MOVQ 24(R8), AX
// multiply
E5: CMPQ BX, R11 // i < n MULQ BX
JL L5 ADDQ SI, AX
MOVQ DX, SI
MOVQ CX, c+64(FP) ADCQ $0, SI
MOVQ AX, 24(R9)
LEAQ 32(R8), R8 // ADD $32, R8
LEAQ 32(R9), R9 // ADD $32, R9
SUBQ $1, DI; JNZ loop4cont
loop4done:
MOVQ SI, c+64(FP)
RET RET
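
The reference semantics for mulAddVWW (z = x*m + a, returning the final
carry word), again as an illustrative pure-Go sketch using the Word type
and math/bits import from the sketch near the top:

	func mulAddVWW(z, x []Word, m, a Word) (c Word) {
		c = a
		for i := range z {
			hi, lo := bits.Mul(uint(x[i]), uint(m))
			lo, carry := bits.Add(lo, uint(c), 0)
			z[i] = Word(lo)
			c = Word(hi + carry)
		}
		return c
	}
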
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word) // func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0 TEXT ·addMulVVWW(SB), NOSPLIT, $0
CMPB ·support_adx(SB), $1 CMPB ·hasADX(SB), $0; JNZ altcarry
JEQ adx MOVQ m+72(FP), BX
MOVQ z+0(FP), R14 MOVQ a+80(FP), SI
MOVQ x+24(FP), R10 MOVQ z_len+8(FP), DI
MOVQ y+48(FP), R8 MOVQ x_base+24(FP), R8
MOVQ m+72(FP), R9 MOVQ y_base+48(FP), R9
MOVQ z_len+8(FP), R11 MOVQ z_base+0(FP), R10
MOVQ $0, BX // i = 0 // compute unrolled loop lengths
MOVQ a+80(FP), CX // c = 0 MOVQ DI, R11
MOVQ R11, R12 ANDQ $3, R11
ANDQ $-2, R12 SHRQ $2, DI
CMPQ R11, $2 loop1:
JAE A6 TESTQ R11, R11; JZ loop1done
JMP E6 loop1cont:
// unroll 1X in batches of 1
A6: MOVQ 0(R9), AX
MOVQ (R8)(BX*8), AX // multiply
MULQ R9 MULQ BX
ADDQ (R10)(BX*8), AX ADDQ SI, AX
ADCQ $0, DX MOVQ DX, SI
ADDQ CX, AX ADCQ $0, SI
ADCQ $0, DX // add
MOVQ DX, CX ADDQ 0(R8), AX
MOVQ AX, (R14)(BX*8) ADCQ $0, SI
MOVQ AX, 0(R10)
MOVQ (8)(R8)(BX*8), AX LEAQ 8(R8), R8 // ADD $8, R8
MULQ R9 LEAQ 8(R9), R9 // ADD $8, R9
ADDQ (8)(R10)(BX*8), AX LEAQ 8(R10), R10 // ADD $8, R10
ADCQ $0, DX SUBQ $1, R11; JNZ loop1cont
ADDQ CX, AX loop1done:
ADCQ $0, DX loop4:
MOVQ DX, CX TESTQ DI, DI; JZ loop4done
MOVQ AX, (8)(R14)(BX*8) loop4cont:
// unroll 4X in batches of 1
ADDQ $2, BX MOVQ 0(R9), AX
CMPQ BX, R12 // multiply
JL A6 MULQ BX
JMP E6 ADDQ SI, AX
MOVQ DX, SI
L6: MOVQ (R8)(BX*8), AX ADCQ $0, SI
MULQ R9 // add
ADDQ CX, AX ADDQ 0(R8), AX
ADCQ $0, DX ADCQ $0, SI
ADDQ (R10)(BX*8), AX MOVQ AX, 0(R10)
MOVQ AX, (R14)(BX*8) MOVQ 8(R9), AX
ADCQ $0, DX // multiply
MOVQ DX, CX MULQ BX
ADDQ $1, BX // i++ ADDQ SI, AX
MOVQ DX, SI
E6: CMPQ BX, R11 // i < n ADCQ $0, SI
JL L6 // add
ADDQ 8(R8), AX
MOVQ CX, c+88(FP) ADCQ $0, SI
MOVQ AX, 8(R10)
MOVQ 16(R9), AX
// multiply
MULQ BX
ADDQ SI, AX
MOVQ DX, SI
ADCQ $0, SI
// add
ADDQ 16(R8), AX
ADCQ $0, SI
MOVQ AX, 16(R10)
MOVQ 24(R9), AX
// multiply
MULQ BX
ADDQ SI, AX
MOVQ DX, SI
ADCQ $0, SI
// add
ADDQ 24(R8), AX
ADCQ $0, SI
MOVQ AX, 24(R10)
LEAQ 32(R8), R8 // ADD $32, R8
LEAQ 32(R9), R9 // ADD $32, R9
LEAQ 32(R10), R10 // ADD $32, R10
SUBQ $1, DI; JNZ loop4cont
loop4done:
MOVQ SI, c+88(FP)
RET RET
altcarry:
adx:
MOVQ z_len+8(FP), R11
MOVQ z+0(FP), R14
MOVQ x+24(FP), R10
MOVQ y+48(FP), R8
MOVQ m+72(FP), DX MOVQ m+72(FP), DX
MOVQ $0, BX // i = 0 MOVQ a+80(FP), BX
MOVQ a+80(FP), CX // carry MOVQ z_len+8(FP), SI
CMPQ R11, $8 MOVQ $0, DI
JAE adx_loop_header MOVQ x_base+24(FP), R8
CMPQ BX, R11 MOVQ y_base+48(FP), R9
JL adx_short MOVQ z_base+0(FP), R10
MOVQ CX, c+88(FP) // compute unrolled loop lengths
MOVQ SI, R11
ANDQ $7, R11
SHRQ $3, SI
alt1:
TESTQ R11, R11; JZ alt1done
alt1cont:
// unroll 1X
// multiply and add
TESTQ AX, AX // clear carry
TESTQ AX, AX // clear carry
MULXQ 0(R9), R13, R12
ADCXQ BX, R13
ADOXQ 0(R8), R13
MOVQ R13, 0(R10)
MOVQ R12, BX
ADCXQ DI, BX
ADOXQ DI, BX
LEAQ 8(R8), R8 // ADD $8, R8
LEAQ 8(R9), R9 // ADD $8, R9
LEAQ 8(R10), R10 // ADD $8, R10
SUBQ $1, R11; JNZ alt1cont
alt1done:
alt8:
TESTQ SI, SI; JZ alt8done
alt8cont:
// unroll 8X in batches of 2
// multiply and add
TESTQ AX, AX // clear carry
TESTQ AX, AX // clear carry
MULXQ 0(R9), R13, R11
ADCXQ BX, R13
ADOXQ 0(R8), R13
MULXQ 8(R9), R14, BX
ADCXQ R11, R14
ADOXQ 8(R8), R14
MOVQ R13, 0(R10)
MOVQ R14, 8(R10)
MULXQ 16(R9), R13, R11
ADCXQ BX, R13
ADOXQ 16(R8), R13
MULXQ 24(R9), R14, BX
ADCXQ R11, R14
ADOXQ 24(R8), R14
MOVQ R13, 16(R10)
MOVQ R14, 24(R10)
MULXQ 32(R9), R13, R11
ADCXQ BX, R13
ADOXQ 32(R8), R13
MULXQ 40(R9), R14, BX
ADCXQ R11, R14
ADOXQ 40(R8), R14
MOVQ R13, 32(R10)
MOVQ R14, 40(R10)
MULXQ 48(R9), R13, R11
ADCXQ BX, R13
ADOXQ 48(R8), R13
MULXQ 56(R9), R14, BX
ADCXQ R11, R14
ADOXQ 56(R8), R14
MOVQ R13, 48(R10)
MOVQ R14, 56(R10)
ADCXQ DI, BX
ADOXQ DI, BX
LEAQ 64(R8), R8 // ADD $64, R8
LEAQ 64(R9), R9 // ADD $64, R9
LEAQ 64(R10), R10 // ADD $64, R10
SUBQ $1, SI; JNZ alt8cont
alt8done:
MOVQ BX, c+88(FP)
RET RET
adx_loop_header:
MOVQ R11, R13
ANDQ $-8, R13
adx_loop:
XORQ R9, R9 // unset flags
MULXQ (R8), SI, DI
ADCXQ CX,SI
ADOXQ (R10), SI
MOVQ SI,(R14)
MULXQ 8(R8), AX, CX
ADCXQ DI, AX
ADOXQ 8(R10), AX
MOVQ AX, 8(R14)
MULXQ 16(R8), SI, DI
ADCXQ CX, SI
ADOXQ 16(R10), SI
MOVQ SI, 16(R14)
MULXQ 24(R8), AX, CX
ADCXQ DI, AX
ADOXQ 24(R10), AX
MOVQ AX, 24(R14)
MULXQ 32(R8), SI, DI
ADCXQ CX, SI
ADOXQ 32(R10), SI
MOVQ SI, 32(R14)
MULXQ 40(R8), AX, CX
ADCXQ DI, AX
ADOXQ 40(R10), AX
MOVQ AX, 40(R14)
MULXQ 48(R8), SI, DI
ADCXQ CX, SI
ADOXQ 48(R10), SI
MOVQ SI, 48(R14)
MULXQ 56(R8), AX, CX
ADCXQ DI, AX
ADOXQ 56(R10), AX
MOVQ AX, 56(R14)
ADCXQ R9, CX
ADOXQ R9, CX
ADDQ $64, R8
ADDQ $64, R10
ADDQ $64, R14
ADDQ $8, BX
CMPQ BX, R13
JL adx_loop
MOVQ z+0(FP), R14
MOVQ x+24(FP), R10
MOVQ y+48(FP), R8
CMPQ BX, R11
JL adx_short
MOVQ CX, c+88(FP)
RET
adx_short:
MULXQ (R8)(BX*8), SI, DI
ADDQ CX, SI
ADCQ $0, DI
ADDQ (R10)(BX*8), SI
MOVQ SI, (R14)(BX*8)
ADCQ $0, DI
MOVQ DI, CX
ADDQ $1, BX // i++
CMPQ BX, R11
JL adx_short
MOVQ CX, c+88(FP)
RET
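
Both amd64 paths above, the plain MULQ loop and the ADX path taken when
hasADX is set (MULXQ/ADCXQ/ADOXQ keep two independent carry chains in CF
and OF), are unrolled forms of the same reference loop, roughly:

	// addMulVVWW sets z = x + y*m + a and returns the final carry word.
	func addMulVVWW(z, x, y []Word, m, a Word) (c Word) {
		c = a
		for i := range z {
			hi, lo := bits.Mul(uint(y[i]), uint(m))
			lo, cc1 := bits.Add(lo, uint(c), 0)
			lo, cc2 := bits.Add(lo, uint(x[i]), 0)
			z[i] = Word(lo)
			c = Word(hi + cc1 + cc2)
		}
		return c
	}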


@@ -0,0 +1,14 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go

package big

import "testing"

func TestAddMulVVWWNoADX(t *testing.T) {
	setDuringTest(t, &hasADX, false)
	TestAddMulVVWW(t)
}
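
setDuringTest is assumed here to override a package variable for the
duration of one test and restore it afterwards; a plausible shape, given
as an assumption rather than the actual helper:

	// Hypothetical sketch of setDuringTest; the real helper may differ.
	func setDuringTest[T any](t *testing.T, p *T, v T) {
		old := *p
		*p = v
		t.Cleanup(func() { *p = old })
	}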


@@ -1,197 +1,355 @@
// Copyright 2009 The Go Authors. All rights reserved. // Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go //go:build !math_big_pure_go
#include "textflag.h" #include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// func addVV(z, x, y []Word) (c Word) // func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB), NOSPLIT, $0 TEXT ·addVV(SB), NOSPLIT, $0
ADD.S $0, R0 // clear carry flag MOVW z_len+4(FP), R0
MOVW z+0(FP), R1 MOVW x_base+12(FP), R1
MOVW z_len+4(FP), R4 MOVW y_base+24(FP), R2
MOVW x+12(FP), R2 MOVW z_base+0(FP), R3
MOVW y+24(FP), R3 // compute unrolled loop lengths
ADD R4<<2, R1, R4 AND $3, R0, R4
B E1 MOVW R0>>2, R0
L1: ADD.S $0, R0 // clear carry
MOVW.P 4(R2), R5 loop1:
MOVW.P 4(R3), R6 TEQ $0, R4; BEQ loop1done
loop1cont:
// unroll 1X
MOVW.P 4(R1), R5
MOVW.P 4(R2), R6
ADC.S R6, R5 ADC.S R6, R5
MOVW.P R5, 4(R1) MOVW.P R5, 4(R3)
E1: SUB $1, R4
TEQ R1, R4 TEQ $0, R4; BNE loop1cont
BNE L1 loop1done:
loop4:
MOVW $0, R0 TEQ $0, R0; BEQ loop4done
MOVW.CS $1, R0 loop4cont:
MOVW R0, c+36(FP) // unroll 4X
MOVW.P 4(R1), R4
MOVW.P 4(R1), R5
MOVW.P 4(R1), R6
MOVW.P 4(R1), R7
MOVW.P 4(R2), R8
MOVW.P 4(R2), R9
MOVW.P 4(R2), R11
MOVW.P 4(R2), R12
ADC.S R8, R4
ADC.S R9, R5
ADC.S R11, R6
ADC.S R12, R7
MOVW.P R4, 4(R3)
MOVW.P R5, 4(R3)
MOVW.P R6, 4(R3)
MOVW.P R7, 4(R3)
SUB $1, R0
TEQ $0, R0; BNE loop4cont
loop4done:
SBC R1, R1 // save carry
ADD $1, R1 // convert add carry
MOVW R1, c+36(FP)
RET RET
// func subVV(z, x, y []Word) (c Word) // func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBC instead of ADC and label names)
TEXT ·subVV(SB), NOSPLIT, $0 TEXT ·subVV(SB), NOSPLIT, $0
SUB.S $0, R0 // clear borrow flag MOVW z_len+4(FP), R0
MOVW z+0(FP), R1 MOVW x_base+12(FP), R1
MOVW z_len+4(FP), R4 MOVW y_base+24(FP), R2
MOVW x+12(FP), R2 MOVW z_base+0(FP), R3
MOVW y+24(FP), R3 // compute unrolled loop lengths
ADD R4<<2, R1, R4 AND $3, R0, R4
B E2 MOVW R0>>2, R0
L2: SUB.S $0, R0 // clear carry
MOVW.P 4(R2), R5 loop1:
MOVW.P 4(R3), R6 TEQ $0, R4; BEQ loop1done
loop1cont:
// unroll 1X
MOVW.P 4(R1), R5
MOVW.P 4(R2), R6
SBC.S R6, R5 SBC.S R6, R5
MOVW.P R5, 4(R1) MOVW.P R5, 4(R3)
E2: SUB $1, R4
TEQ R1, R4 TEQ $0, R4; BNE loop1cont
BNE L2 loop1done:
loop4:
MOVW $0, R0 TEQ $0, R0; BEQ loop4done
MOVW.CC $1, R0 loop4cont:
MOVW R0, c+36(FP) // unroll 4X
MOVW.P 4(R1), R4
MOVW.P 4(R1), R5
MOVW.P 4(R1), R6
MOVW.P 4(R1), R7
MOVW.P 4(R2), R8
MOVW.P 4(R2), R9
MOVW.P 4(R2), R11
MOVW.P 4(R2), R12
SBC.S R8, R4
SBC.S R9, R5
SBC.S R11, R6
SBC.S R12, R7
MOVW.P R4, 4(R3)
MOVW.P R5, 4(R3)
MOVW.P R6, 4(R3)
MOVW.P R7, 4(R3)
SUB $1, R0
TEQ $0, R0; BNE loop4cont
loop4done:
SBC R1, R1 // save carry
RSB $0, R1, R1 // convert sub carry
MOVW R1, c+36(FP)
RET RET
// func lshVU(z, x []Word, s uint) (c Word) // func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0 TEXT ·lshVU(SB), NOSPLIT, $0
MOVW z_len+4(FP), R5 MOVW z_len+4(FP), R0
TEQ $0, R5 TEQ $0, R0; BEQ ret0
BEQ X7 MOVW s+24(FP), R1
MOVW x_base+12(FP), R2
MOVW z+0(FP), R1 MOVW z_base+0(FP), R3
MOVW x+12(FP), R2 // run loop backward
ADD R5<<2, R2, R2 ADD R0<<2, R2, R2
ADD R5<<2, R1, R5 ADD R0<<2, R3, R3
MOVW s+24(FP), R3 // shift first word into carry
ADD $4, R1 // stop one word early MOVW.W -4(R2), R4
MOVW $32, R4 MOVW $32, R5
SUB R3, R4 SUB R1, R5
MOVW $0, R7 MOVW R4>>R5, R6
MOVW R4<<R1, R4
MOVW.W -4(R2), R6
MOVW R6<<R3, R7
MOVW R6>>R4, R6
MOVW R6, c+28(FP) MOVW R6, c+28(FP)
B E7 // shift remaining words
SUB $1, R0
L7: // compute unrolled loop lengths
AND $3, R0, R6
MOVW R0>>2, R0
loop1:
TEQ $0, R6; BEQ loop1done
loop1cont:
// unroll 1X
MOVW.W -4(R2), R7
ORR R7>>R5, R4
MOVW.W R4, -4(R3)
MOVW R7<<R1, R4
SUB $1, R6
TEQ $0, R6; BNE loop1cont
loop1done:
loop4:
TEQ $0, R0; BEQ loop4done
loop4cont:
// unroll 4X
MOVW.W -4(R2), R6 MOVW.W -4(R2), R6
ORR R6>>R4, R7 MOVW.W -4(R2), R7
MOVW.W R7, -4(R5) MOVW.W -4(R2), R8
MOVW R6<<R3, R7 MOVW.W -4(R2), R9
E7: ORR R6>>R5, R4
TEQ R1, R5 MOVW.W R4, -4(R3)
BNE L7 MOVW R6<<R1, R4
ORR R7>>R5, R4
MOVW R7, -4(R5) MOVW.W R4, -4(R3)
MOVW R7<<R1, R4
ORR R8>>R5, R4
MOVW.W R4, -4(R3)
MOVW R8<<R1, R4
ORR R9>>R5, R4
MOVW.W R4, -4(R3)
MOVW R9<<R1, R4
SUB $1, R0
TEQ $0, R0; BNE loop4cont
loop4done:
// store final shifted bits
MOVW.W R4, -4(R3)
RET RET
ret0:
X7:
MOVW $0, R1 MOVW $0, R1
MOVW R1, c+28(FP) MOVW R1, c+28(FP)
RET RET
// func rshVU(z, x []Word, s uint) (c Word) // func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB), NOSPLIT, $0 TEXT ·rshVU(SB), NOSPLIT, $0
MOVW z_len+4(FP), R5 MOVW z_len+4(FP), R0
TEQ $0, R5 TEQ $0, R0; BEQ ret0
BEQ X6 MOVW s+24(FP), R1
MOVW x_base+12(FP), R2
MOVW z+0(FP), R1 MOVW z_base+0(FP), R3
MOVW x+12(FP), R2 // shift first word into carry
ADD R5<<2, R1, R5 MOVW.P 4(R2), R4
MOVW s+24(FP), R3 MOVW $32, R5
SUB $4, R5 // stop one word early SUB R1, R5
MOVW $32, R4 MOVW R4<<R5, R6
SUB R3, R4 MOVW R4>>R1, R4
MOVW $0, R7
// first word
MOVW.P 4(R2), R6
MOVW R6>>R3, R7
MOVW R6<<R4, R6
MOVW R6, c+28(FP) MOVW R6, c+28(FP)
B E6 // shift remaining words
SUB $1, R0
// word loop // compute unrolled loop lengths
L6: AND $3, R0, R6
MOVW R0>>2, R0
loop1:
TEQ $0, R6; BEQ loop1done
loop1cont:
// unroll 1X
MOVW.P 4(R2), R7
ORR R7<<R5, R4
MOVW.P R4, 4(R3)
MOVW R7>>R1, R4
SUB $1, R6
TEQ $0, R6; BNE loop1cont
loop1done:
loop4:
TEQ $0, R0; BEQ loop4done
loop4cont:
// unroll 4X
MOVW.P 4(R2), R6 MOVW.P 4(R2), R6
ORR R6<<R4, R7 MOVW.P 4(R2), R7
MOVW.P R7, 4(R1) MOVW.P 4(R2), R8
MOVW R6>>R3, R7 MOVW.P 4(R2), R9
E6: ORR R6<<R5, R4
TEQ R1, R5 MOVW.P R4, 4(R3)
BNE L6 MOVW R6>>R1, R4
ORR R7<<R5, R4
MOVW R7, 0(R1) MOVW.P R4, 4(R3)
MOVW R7>>R1, R4
ORR R8<<R5, R4
MOVW.P R4, 4(R3)
MOVW R8>>R1, R4
ORR R9<<R5, R4
MOVW.P R4, 4(R3)
MOVW R9>>R1, R4
SUB $1, R0
TEQ $0, R0; BNE loop4cont
loop4done:
// store final shifted bits
MOVW.P R4, 4(R3)
RET RET
ret0:
X6:
MOVW $0, R1 MOVW $0, R1
MOVW R1, c+28(FP) MOVW R1, c+28(FP)
RET RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word) // func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0 TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVW $0, R0 MOVW m+24(FP), R0
MOVW z+0(FP), R1 MOVW a+28(FP), R1
MOVW z_len+4(FP), R5 MOVW z_len+4(FP), R2
MOVW x+12(FP), R2 MOVW x_base+12(FP), R3
MOVW m+24(FP), R3 MOVW z_base+0(FP), R4
MOVW a+28(FP), R4 // compute unrolled loop lengths
ADD R5<<2, R1, R5 AND $3, R2, R5
B E8 MOVW R2>>2, R2
loop1:
// word loop TEQ $0, R5; BEQ loop1done
L8: loop1cont:
MOVW.P 4(R2), R6 // unroll 1X
MULLU R6, R3, (R7, R6) MOVW.P 4(R3), R6
ADD.S R4, R6 // multiply
ADC R0, R7 MULLU R0, R6, (R7, R6)
MOVW.P R6, 4(R1) ADD.S R1, R6
MOVW R7, R4 ADC $0, R7, R1
E8: MOVW.P R6, 4(R4)
TEQ R1, R5 SUB $1, R5
BNE L8 TEQ $0, R5; BNE loop1cont
loop1done:
MOVW R4, c+32(FP) loop4:
TEQ $0, R2; BEQ loop4done
loop4cont:
// unroll 4X in batches of 2
MOVW.P 4(R3), R5
MOVW.P 4(R3), R6
// multiply
MULLU R0, R5, (R7, R5)
ADD.S R1, R5
MULLU R0, R6, (R8, R6)
ADC.S R7, R6
ADC $0, R8, R1
MOVW.P R5, 4(R4)
MOVW.P R6, 4(R4)
MOVW.P 4(R3), R5
MOVW.P 4(R3), R6
// multiply
MULLU R0, R5, (R7, R5)
ADD.S R1, R5
MULLU R0, R6, (R8, R6)
ADC.S R7, R6
ADC $0, R8, R1
MOVW.P R5, 4(R4)
MOVW.P R6, 4(R4)
SUB $1, R2
TEQ $0, R2; BNE loop4cont
loop4done:
MOVW R1, c+32(FP)
RET RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word) // func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0 TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVW $0, R0 MOVW m+36(FP), R0
MOVW z+0(FP), R9 MOVW a+40(FP), R1
MOVW x+12(FP), R1 MOVW z_len+4(FP), R2
MOVW z_len+4(FP), R5 MOVW x_base+12(FP), R3
MOVW y+24(FP), R2 MOVW y_base+24(FP), R4
MOVW m+36(FP), R3 MOVW z_base+0(FP), R5
ADD R5<<2, R1, R5 // compute unrolled loop lengths
MOVW a+40(FP), R4 AND $3, R2, R6
B E9 MOVW R2>>2, R2
loop1:
// word loop TEQ $0, R6; BEQ loop1done
L9: loop1cont:
MOVW.P 4(R2), R6 // unroll 1X
MULLU R6, R3, (R7, R6) MOVW.P 4(R3), R7
ADD.S R4, R6 MOVW.P 4(R4), R8
ADC R0, R7 // multiply
MOVW.P 4(R1), R4 MULLU R0, R8, (R9, R8)
ADD.S R4, R6 ADD.S R1, R8
ADC R0, R7 ADC $0, R9, R1
MOVW.P R6, 4(R9) // add
MOVW R7, R4 ADD.S R7, R8
E9: ADC $0, R1
TEQ R1, R5 MOVW.P R8, 4(R5)
BNE L9 SUB $1, R6
TEQ $0, R6; BNE loop1cont
MOVW R4, c+44(FP) loop1done:
loop4:
TEQ $0, R2; BEQ loop4done
loop4cont:
// unroll 4X in batches of 2
MOVW.P 4(R3), R6
MOVW.P 4(R3), R7
MOVW.P 4(R4), R8
MOVW.P 4(R4), R9
// multiply
MULLU R0, R8, (R11, R8)
ADD.S R1, R8
MULLU R0, R9, (R12, R9)
ADC.S R11, R9
ADC $0, R12, R1
// add
ADD.S R6, R8
ADC.S R7, R9
ADC $0, R1
MOVW.P R8, 4(R5)
MOVW.P R9, 4(R5)
MOVW.P 4(R3), R6
MOVW.P 4(R3), R7
MOVW.P 4(R4), R8
MOVW.P 4(R4), R9
// multiply
MULLU R0, R8, (R11, R8)
ADD.S R1, R8
MULLU R0, R9, (R12, R9)
ADC.S R11, R9
ADC $0, R12, R1
// add
ADD.S R6, R8
ADC.S R7, R9
ADC $0, R1
MOVW.P R8, 4(R5)
MOVW.P R9, 4(R5)
SUB $1, R2
TEQ $0, R2; BNE loop4cont
loop4done:
MOVW R1, c+44(FP)
RET RET


@@ -1,375 +1,374 @@
// Copyright 2013 The Go Authors. All rights reserved. // Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go //go:build !math_big_pure_go
#include "textflag.h" #include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// TODO: Consider re-implementing using Advanced SIMD
// once the assembler supports those instructions.
// func addVV(z, x, y []Word) (c Word) // func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB), NOSPLIT, $0 TEXT ·addVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0 MOVD z_len+8(FP), R0
MOVD x+24(FP), R8 MOVD x_base+24(FP), R1
MOVD y+48(FP), R9 MOVD y_base+48(FP), R2
MOVD z+0(FP), R10 MOVD z_base+0(FP), R3
ADDS $0, R0 // clear carry flag // compute unrolled loop lengths
TBZ $0, R0, two AND $3, R0, R4
MOVD.P 8(R8), R11 LSR $2, R0
MOVD.P 8(R9), R15 ADDS ZR, R0 // clear carry
ADCS R15, R11 loop1:
MOVD.P R11, 8(R10) CBZ R4, loop1done
loop1cont:
// unroll 1X
MOVD.P 8(R1), R5
MOVD.P 8(R2), R6
ADCS R6, R5
MOVD.P R5, 8(R3)
SUB $1, R4
CBNZ R4, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
LDP.P 32(R1), (R4, R5)
LDP -16(R1), (R6, R7)
LDP.P 32(R2), (R8, R9)
LDP -16(R2), (R10, R11)
ADCS R8, R4
ADCS R9, R5
ADCS R10, R6
ADCS R11, R7
STP.P (R4, R5), 32(R3)
STP (R6, R7), -16(R3)
SUB $1, R0 SUB $1, R0
two: CBNZ R0, loop4cont
TBZ $1, R0, loop loop4done:
LDP.P 16(R8), (R11, R12) ADC ZR, ZR, R1 // save & convert add carry
LDP.P 16(R9), (R15, R16) MOVD R1, c+72(FP)
ADCS R15, R11
ADCS R16, R12
STP.P (R11, R12), 16(R10)
SUB $2, R0
loop:
CBZ R0, done // careful not to touch the carry flag
LDP.P 32(R8), (R11, R12)
LDP -16(R8), (R13, R14)
LDP.P 32(R9), (R15, R16)
LDP -16(R9), (R17, R19)
ADCS R15, R11
ADCS R16, R12
ADCS R17, R13
ADCS R19, R14
STP.P (R11, R12), 32(R10)
STP (R13, R14), -16(R10)
SUB $4, R0
B loop
done:
CSET HS, R0 // extract carry flag
MOVD R0, c+72(FP)
RET RET
// func subVV(z, x, y []Word) (c Word) // func subVV(z, x, y []Word) (c Word)
TEXT ·subVV(SB), NOSPLIT, $0 TEXT ·subVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0 MOVD z_len+8(FP), R0
MOVD x+24(FP), R8 MOVD x_base+24(FP), R1
MOVD y+48(FP), R9 MOVD y_base+48(FP), R2
MOVD z+0(FP), R10 MOVD z_base+0(FP), R3
CMP R0, R0 // set carry flag // compute unrolled loop lengths
TBZ $0, R0, two AND $3, R0, R4
MOVD.P 8(R8), R11 LSR $2, R0
MOVD.P 8(R9), R15 SUBS ZR, R0 // clear carry
SBCS R15, R11 loop1:
MOVD.P R11, 8(R10) CBZ R4, loop1done
loop1cont:
// unroll 1X
MOVD.P 8(R1), R5
MOVD.P 8(R2), R6
SBCS R6, R5
MOVD.P R5, 8(R3)
SUB $1, R4
CBNZ R4, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
LDP.P 32(R1), (R4, R5)
LDP -16(R1), (R6, R7)
LDP.P 32(R2), (R8, R9)
LDP -16(R2), (R10, R11)
SBCS R8, R4
SBCS R9, R5
SBCS R10, R6
SBCS R11, R7
STP.P (R4, R5), 32(R3)
STP (R6, R7), -16(R3)
SUB $1, R0 SUB $1, R0
two: CBNZ R0, loop4cont
TBZ $1, R0, loop loop4done:
LDP.P 16(R8), (R11, R12) SBC R1, R1 // save carry
LDP.P 16(R9), (R15, R16) SUB R1, ZR, R1 // convert sub carry
SBCS R15, R11 MOVD R1, c+72(FP)
SBCS R16, R12
STP.P (R11, R12), 16(R10)
SUB $2, R0
loop:
CBZ R0, done // careful not to touch the carry flag
LDP.P 32(R8), (R11, R12)
LDP -16(R8), (R13, R14)
LDP.P 32(R9), (R15, R16)
LDP -16(R9), (R17, R19)
SBCS R15, R11
SBCS R16, R12
SBCS R17, R13
SBCS R19, R14
STP.P (R11, R12), 32(R10)
STP (R13, R14), -16(R10)
SUB $4, R0
B loop
done:
CSET LO, R0 // extract carry flag
MOVD R0, c+72(FP)
RET RET
// func lshVU(z, x []Word, s uint) (c Word) // func lshVU(z, x []Word, s uint) (c Word)
// This implementation handles the shift operation from the high word to the low word,
// which may be an error for the case where the low word of x overlaps with the high
// word of z. When calling this function directly, you need to pay attention to this
// situation.
TEXT ·lshVU(SB), NOSPLIT, $0 TEXT ·lshVU(SB), NOSPLIT, $0
LDP z+0(FP), (R0, R1) // R0 = z.ptr, R1 = len(z) MOVD z_len+8(FP), R0
MOVD x+24(FP), R2 CBZ R0, ret0
MOVD s+48(FP), R3 MOVD s+48(FP), R1
ADD R1<<3, R0 // R0 = &z[n] MOVD x_base+24(FP), R2
ADD R1<<3, R2 // R2 = &x[n] MOVD z_base+0(FP), R3
CBZ R1, len0 // run loop backward
MOVD $64, R4 ADD R0<<3, R2, R2
SUB R3, R4 ADD R0<<3, R3, R3
// handling the most significant element x[n-1] // shift first word into carry
MOVD.W -8(R2), R6 MOVD.W -8(R2), R4
LSR R4, R6, R5 // return value MOVD $64, R5
LSL R3, R6, R8 // x[i] << s SUB R1, R5
SUB $1, R1 LSR R5, R4, R6
one: TBZ $0, R1, two LSL R1, R4
MOVD.W -8(R2), R6 MOVD R6, c+56(FP)
LSR R4, R6, R7 // shift remaining words
ORR R8, R7 SUB $1, R0
LSL R3, R6, R8 // compute unrolled loop lengths
SUB $1, R1 AND $3, R0, R6
MOVD.W R7, -8(R0) LSR $2, R0
two: loop1:
TBZ $1, R1, loop CBZ R6, loop1done
LDP.W -16(R2), (R6, R7) loop1cont:
LSR R4, R7, R10 // unroll 1X
ORR R8, R10 MOVD.W -8(R2), R7
LSL R3, R7 LSR R5, R7, R8
LSR R4, R6, R9 ORR R4, R8
ORR R7, R9 LSL R1, R7, R4
LSL R3, R6, R8 MOVD.W R8, -8(R3)
SUB $2, R1 SUB $1, R6
STP.W (R9, R10), -16(R0) CBNZ R6, loop1cont
loop: loop1done:
CBZ R1, done loop4:
LDP.W -32(R2), (R10, R11) CBZ R0, loop4done
LDP 16(R2), (R12, R13) loop4cont:
LSR R4, R13, R23 // unroll 4X
ORR R8, R23 // z[i] = (x[i] << s) | (x[i-1] >> (64 - s)) LDP.W -32(R2), (R9, R8)
LSL R3, R13 LDP 16(R2), (R7, R6)
LSR R4, R12, R22 LSR R5, R6, R10
ORR R13, R22 ORR R4, R10
LSL R3, R12 LSL R1, R6, R4
LSR R4, R11, R21 LSR R5, R7, R6
ORR R12, R21 ORR R4, R6
LSL R3, R11 LSL R1, R7, R4
LSR R4, R10, R20 LSR R5, R8, R7
ORR R11, R20 ORR R4, R7
LSL R3, R10, R8 LSL R1, R8, R4
STP.W (R20, R21), -32(R0) LSR R5, R9, R8
STP (R22, R23), 16(R0) ORR R4, R8
SUB $4, R1 LSL R1, R9, R4
B loop STP.W (R8, R7), -32(R3)
done: STP (R6, R10), 16(R3)
MOVD.W R8, -8(R0) // the first element x[0] SUB $1, R0
MOVD R5, c+56(FP) // the part moved out from x[n-1] CBNZ R0, loop4cont
loop4done:
// store final shifted bits
MOVD.W R4, -8(R3)
RET RET
len0: ret0:
MOVD $0, c+56(FP) MOVD ZR, c+56(FP)
RET RET
// func rshVU(z, x []Word, s uint) (c Word) // func rshVU(z, x []Word, s uint) (c Word)
// This implementation handles the shift operation from the low word to the high word,
// which may be an error for the case where the high word of x overlaps with the low
// word of z. When calling this function directly, you need to pay attention to this
// situation.
TEXT ·rshVU(SB), NOSPLIT, $0 TEXT ·rshVU(SB), NOSPLIT, $0
MOVD z+0(FP), R0 MOVD z_len+8(FP), R0
MOVD z_len+8(FP), R1 CBZ R0, ret0
MOVD x+24(FP), R2 MOVD s+48(FP), R1
MOVD s+48(FP), R3 MOVD x_base+24(FP), R2
MOVD $0, R8 MOVD z_base+0(FP), R3
MOVD $64, R4 // shift first word into carry
SUB R3, R4 MOVD.P 8(R2), R4
CBZ R1, len0 MOVD $64, R5
SUB R1, R5
MOVD.P 8(R2), R20 LSL R5, R4, R6
LSR R3, R20, R8 LSR R1, R4
LSL R4, R20 MOVD R6, c+56(FP)
MOVD R20, c+56(FP) // deal with the first element // shift remaining words
SUB $1, R1 SUB $1, R0
// compute unrolled loop lengths
TBZ $0, R1, two AND $3, R0, R6
MOVD.P 8(R2), R6 LSR $2, R0
LSL R4, R6, R20 loop1:
ORR R8, R20 CBZ R6, loop1done
LSR R3, R6, R8 loop1cont:
MOVD.P R20, 8(R0) // unroll 1X
SUB $1, R1 MOVD.P 8(R2), R7
two: LSL R5, R7, R8
TBZ $1, R1, loop ORR R4, R8
LDP.P 16(R2), (R6, R7) LSR R1, R7, R4
LSL R4, R6, R20 MOVD.P R8, 8(R3)
LSR R3, R6 SUB $1, R6
ORR R8, R20 CBNZ R6, loop1cont
LSL R4, R7, R21 loop1done:
LSR R3, R7, R8 loop4:
ORR R6, R21 CBZ R0, loop4done
STP.P (R20, R21), 16(R0) loop4cont:
SUB $2, R1 // unroll 4X
loop: LDP.P 32(R2), (R6, R7)
CBZ R1, done LDP -16(R2), (R8, R9)
LDP.P 32(R2), (R10, R11) LSL R5, R6, R10
LDP -16(R2), (R12, R13) ORR R4, R10
LSL R4, R10, R20 LSR R1, R6, R4
LSR R3, R10 LSL R5, R7, R6
ORR R8, R20 // z[i] = (x[i] >> s) | (x[i+1] << (64 - s)) ORR R4, R6
LSL R4, R11, R21 LSR R1, R7, R4
LSR R3, R11 LSL R5, R8, R7
ORR R10, R21 ORR R4, R7
LSL R4, R12, R22 LSR R1, R8, R4
LSR R3, R12 LSL R5, R9, R8
ORR R11, R22 ORR R4, R8
LSL R4, R13, R23 LSR R1, R9, R4
LSR R3, R13, R8 STP.P (R10, R6), 32(R3)
ORR R12, R23 STP (R7, R8), -16(R3)
STP.P (R20, R21), 32(R0) SUB $1, R0
STP (R22, R23), -16(R0) CBNZ R0, loop4cont
SUB $4, R1 loop4done:
B loop // store final shifted bits
done: MOVD.P R4, 8(R3)
MOVD R8, (R0) // deal with the last element
RET RET
len0: ret0:
MOVD $0, c+56(FP) MOVD ZR, c+56(FP)
RET RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word) // func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0 TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R1 MOVD m+48(FP), R0
MOVD z_len+8(FP), R0 MOVD a+56(FP), R1
MOVD x+24(FP), R2 MOVD z_len+8(FP), R2
MOVD m+48(FP), R3 MOVD x_base+24(FP), R3
MOVD a+56(FP), R4 MOVD z_base+0(FP), R4
// c, z = x * y + r // compute unrolled loop lengths
TBZ $0, R0, two AND $7, R2, R5
MOVD.P 8(R2), R5 LSR $3, R2
MUL R3, R5, R7 loop1:
UMULH R3, R5, R8 CBZ R5, loop1done
ADDS R4, R7 loop1cont:
ADC $0, R8, R4 // c, z[i] = x[i] * y + r // unroll 1X
MOVD.P R7, 8(R1) MOVD.P 8(R3), R6
SUB $1, R0 // multiply
two: UMULH R0, R6, R7
TBZ $1, R0, loop MUL R0, R6
LDP.P 16(R2), (R5, R6) ADDS R1, R6
MUL R3, R5, R10 ADC ZR, R7, R1
UMULH R3, R5, R11 MOVD.P R6, 8(R4)
ADDS R4, R10 SUB $1, R5
MUL R3, R6, R12 CBNZ R5, loop1cont
UMULH R3, R6, R13 loop1done:
ADCS R12, R11 loop8:
ADC $0, R13, R4 CBZ R2, loop8done
loop8cont:
STP.P (R10, R11), 16(R1) // unroll 8X
SUB $2, R0 LDP.P 64(R3), (R5, R6)
loop: LDP -48(R3), (R7, R8)
CBZ R0, done LDP -32(R3), (R9, R10)
LDP.P 32(R2), (R5, R6) LDP -16(R3), (R11, R12)
LDP -16(R2), (R7, R8) // multiply
UMULH R0, R5, R13
MUL R3, R5, R10 MUL R0, R5
UMULH R3, R5, R11 ADDS R1, R5
ADDS R4, R10 UMULH R0, R6, R14
MUL R3, R6, R12 MUL R0, R6
UMULH R3, R6, R13 ADCS R13, R6
ADCS R11, R12 UMULH R0, R7, R13
MUL R0, R7
MUL R3, R7, R14 ADCS R14, R7
UMULH R3, R7, R15 UMULH R0, R8, R14
ADCS R13, R14 MUL R0, R8
MUL R3, R8, R16 ADCS R13, R8
UMULH R3, R8, R17 UMULH R0, R9, R13
ADCS R15, R16 MUL R0, R9
ADC $0, R17, R4 ADCS R14, R9
UMULH R0, R10, R14
STP.P (R10, R12), 32(R1) MUL R0, R10
STP (R14, R16), -16(R1) ADCS R13, R10
SUB $4, R0 UMULH R0, R11, R13
B loop MUL R0, R11
done: ADCS R14, R11
MOVD R4, c+64(FP) UMULH R0, R12, R14
MUL R0, R12
ADCS R13, R12
ADC ZR, R14, R1
STP.P (R5, R6), 64(R4)
STP (R7, R8), -48(R4)
STP (R9, R10), -32(R4)
STP (R11, R12), -16(R4)
SUB $1, R2
CBNZ R2, loop8cont
loop8done:
MOVD R1, c+64(FP)
RET RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word) // func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0 TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R22 MOVD m+72(FP), R0
MOVD x+24(FP), R1 MOVD a+80(FP), R1
MOVD z_len+8(FP), R0 MOVD z_len+8(FP), R2
MOVD y+48(FP), R2 MOVD x_base+24(FP), R3
MOVD m+72(FP), R3 MOVD y_base+48(FP), R4
MOVD a+80(FP), R4 MOVD z_base+0(FP), R5
// compute unrolled loop lengths
TBZ $0, R0, two AND $7, R2, R6
LSR $3, R2
MOVD.P 8(R2), R5 loop1:
MOVD.P 8(R1), R6 CBZ R6, loop1done
loop1cont:
MUL R5, R3, R7 // unroll 1X
UMULH R5, R3, R8 MOVD.P 8(R3), R7
MOVD.P 8(R4), R8
ADDS R4, R7 // multiply
ADC $0, R8 UMULH R0, R8, R9
ADDS R7, R6 MUL R0, R8
ADC $0, R8, R4 ADDS R1, R8
ADC ZR, R9, R1
MOVD.P R6, 8(R22) // add
SUB $1, R0 ADDS R7, R8
ADC ZR, R1
two: MOVD.P R8, 8(R5)
TBZ $1, R0, loop SUB $1, R6
CBNZ R6, loop1cont
LDP.P 16(R2), (R5, R10) loop1done:
LDP.P 16(R1), (R6, R11) loop8:
CBZ R2, loop8done
MUL R10, R3, R13 loop8cont:
UMULH R10, R3, R12 // unroll 8X
LDP.P 64(R3), (R6, R7)
MUL R5, R3, R7 LDP -48(R3), (R8, R9)
UMULH R5, R3, R8 LDP -32(R3), (R10, R11)
LDP -16(R3), (R12, R13)
ADDS R4, R6 LDP.P 64(R4), (R14, R15)
ADCS R13, R11 LDP -48(R4), (R16, R17)
ADC $0, R12 LDP -32(R4), (R19, R20)
LDP -16(R4), (R21, R22)
ADDS R7, R6 // multiply
ADCS R8, R11 UMULH R0, R14, R23
ADC $0, R12, R4 MUL R0, R14
ADDS R1, R14
STP.P (R6, R11), 16(R22) UMULH R0, R15, R24
SUB $2, R0 MUL R0, R15
ADCS R23, R15
// The main loop of this code operates on a block of 4 words every iteration UMULH R0, R16, R23
// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9] MUL R0, R16
// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next ADCS R24, R16
// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z. UMULH R0, R17, R24
loop: MUL R0, R17
CBZ R0, done ADCS R23, R17
UMULH R0, R19, R23
LDP.P 16(R2), (R5, R6) MUL R0, R19
LDP.P 16(R2), (R7, R8) ADCS R24, R19
UMULH R0, R20, R24
LDP.P 16(R1), (R9, R10) MUL R0, R20
ADDS R4, R9 ADCS R23, R20
MUL R6, R3, R14 UMULH R0, R21, R23
ADCS R14, R10 MUL R0, R21
MUL R7, R3, R15 ADCS R24, R21
LDP.P 16(R1), (R11, R12) UMULH R0, R22, R24
ADCS R15, R11 MUL R0, R22
MUL R8, R3, R16 ADCS R23, R22
ADCS R16, R12 ADC ZR, R24, R1
UMULH R8, R3, R20 // add
ADC $0, R20 ADDS R6, R14
ADCS R7, R15
MUL R5, R3, R13 ADCS R8, R16
ADDS R13, R9 ADCS R9, R17
UMULH R5, R3, R17 ADCS R10, R19
ADCS R17, R10 ADCS R11, R20
UMULH R6, R3, R21 ADCS R12, R21
STP.P (R9, R10), 16(R22) ADCS R13, R22
ADCS R21, R11 ADC ZR, R1
UMULH R7, R3, R19 STP.P (R14, R15), 64(R5)
ADCS R19, R12 STP (R16, R17), -48(R5)
STP.P (R11, R12), 16(R22) STP (R19, R20), -32(R5)
ADC $0, R20, R4 STP (R21, R22), -16(R5)
SUB $1, R2
SUB $4, R0 CBNZ R2, loop8cont
B loop loop8done:
MOVD R1, c+88(FP)
done:
MOVD R4, c+88(FP)
RET RET


@@ -4,6 +4,8 @@

//go:build !math_big_pure_go

+//go:generate go test ./internal/asmgen -generate
+
package big

import _ "unsafe" // for linkname


@@ -1,82 +1,457 @@
// Copyright 2022 The Go Authors. All rights reserved. // Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
//go:build !math_big_pure_go && loong64 // Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go
#include "textflag.h" #include "textflag.h"
// This file provides fast assembly versions for the elementary // func addVV(z, x, y []Word) (c Word)
// arithmetic operations on vectors implemented in arith.go.
TEXT ·addVV(SB), NOSPLIT, $0 TEXT ·addVV(SB), NOSPLIT, $0
JMP ·addVV_g(SB) MOVV z_len+8(FP), R4
MOVV x_base+24(FP), R5
MOVV y_base+48(FP), R6
MOVV z_base+0(FP), R7
// compute unrolled loop lengths
AND $3, R4, R8
SRLV $2, R4
XOR R28, R28 // clear carry
loop1:
BEQ R8, loop1done
loop1cont:
// unroll 1X
MOVV 0(R5), R9
MOVV 0(R6), R10
ADDVU R10, R9 // ADCS R10, R9, R9 (cr=R28)
SGTU R10, R9, R30 // ...
ADDVU R28, R9 // ...
SGTU R28, R9, R28 // ...
ADDVU R30, R28 // ...
MOVV R9, 0(R7)
ADDVU $8, R5
ADDVU $8, R6
ADDVU $8, R7
SUBVU $1, R8
BNE R8, loop1cont
loop1done:
loop4:
BEQ R4, loop4done
loop4cont:
// unroll 4X
MOVV 0(R5), R8
MOVV 8(R5), R9
MOVV 16(R5), R10
MOVV 24(R5), R11
MOVV 0(R6), R12
MOVV 8(R6), R13
MOVV 16(R6), R14
MOVV 24(R6), R15
ADDVU R12, R8 // ADCS R12, R8, R8 (cr=R28)
SGTU R12, R8, R30 // ...
ADDVU R28, R8 // ...
SGTU R28, R8, R28 // ...
ADDVU R30, R28 // ...
ADDVU R13, R9 // ADCS R13, R9, R9 (cr=R28)
SGTU R13, R9, R30 // ...
ADDVU R28, R9 // ...
SGTU R28, R9, R28 // ...
ADDVU R30, R28 // ...
ADDVU R14, R10 // ADCS R14, R10, R10 (cr=R28)
SGTU R14, R10, R30 // ...
ADDVU R28, R10 // ...
SGTU R28, R10, R28 // ...
ADDVU R30, R28 // ...
ADDVU R15, R11 // ADCS R15, R11, R11 (cr=R28)
SGTU R15, R11, R30 // ...
ADDVU R28, R11 // ...
SGTU R28, R11, R28 // ...
ADDVU R30, R28 // ...
MOVV R8, 0(R7)
MOVV R9, 8(R7)
MOVV R10, 16(R7)
MOVV R11, 24(R7)
ADDVU $32, R5
ADDVU $32, R6
ADDVU $32, R7
SUBVU $1, R4
BNE R4, loop4cont
loop4done:
MOVV R28, c+72(FP)
RET
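
loong64 has no carry flag, so each ADCS group above recomputes the carry
with unsigned comparisons (SGTU). In Go terms, one add-with-carry step is
roughly this sketch:

	func addc(x, y, carryIn uint64) (sum, carryOut uint64) {
		s := x + y
		var c1, c2 uint64
		if s < y { // SGTU: carry out of x+y
			c1 = 1
		}
		s += carryIn
		if s < carryIn { // SGTU: carry from adding the incoming carry
			c2 = 1
		}
		return s, c1 + c2
	}
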
// func subVV(z, x, y []Word) (c Word) // func subVV(z, x, y []Word) (c Word)
TEXT ·subVV(SB), NOSPLIT, $0 TEXT ·subVV(SB), NOSPLIT, $0
// input: MOVV z_len+8(FP), R4
// R4: z MOVV x_base+24(FP), R5
// R5: z_len MOVV y_base+48(FP), R6
// R7: x MOVV z_base+0(FP), R7
// R10: y // compute unrolled loop lengths
MOVV z+0(FP), R4 AND $3, R4, R8
MOVV z_len+8(FP), R5 SRLV $2, R4
MOVV x+24(FP), R7 XOR R28, R28 // clear carry
MOVV y+48(FP), R10 loop1:
MOVV $0, R6 BEQ R8, loop1done
SLLV $3, R5 loop1cont:
MOVV $0, R8 // unroll 1X
loop: MOVV 0(R5), R9
BEQ R5, R6, done MOVV 0(R6), R10
MOVV (R6)(R7), R9 SGTU R28, R9, R30 // SBCS R10, R9, R9
MOVV (R6)(R10), R11 SUBVU R28, R9 // ...
SUBV R11, R9, R11 // x1 - y1 = z1', if z1' > x1 then overflow SGTU R10, R9, R28 // ...
SUBV R8, R11, R12 // z1' - c0 = z1, if z1 > z1' then overflow SUBVU R10, R9 // ...
SGTU R11, R9, R9 ADDVU R30, R28 // ...
SGTU R12, R11, R11 MOVV R9, 0(R7)
MOVV R12, (R6)(R4) ADDVU $8, R5
OR R9, R11, R8 ADDVU $8, R6
ADDV $8, R6 ADDVU $8, R7
JMP loop SUBVU $1, R8
done: BNE R8, loop1cont
MOVV R8, c+72(FP) loop1done:
loop4:
BEQ R4, loop4done
loop4cont:
// unroll 4X
MOVV 0(R5), R8
MOVV 8(R5), R9
MOVV 16(R5), R10
MOVV 24(R5), R11
MOVV 0(R6), R12
MOVV 8(R6), R13
MOVV 16(R6), R14
MOVV 24(R6), R15
SGTU R28, R8, R30 // SBCS R12, R8, R8
SUBVU R28, R8 // ...
SGTU R12, R8, R28 // ...
SUBVU R12, R8 // ...
ADDVU R30, R28 // ...
SGTU R28, R9, R30 // SBCS R13, R9, R9
SUBVU R28, R9 // ...
SGTU R13, R9, R28 // ...
SUBVU R13, R9 // ...
ADDVU R30, R28 // ...
SGTU R28, R10, R30 // SBCS R14, R10, R10
SUBVU R28, R10 // ...
SGTU R14, R10, R28 // ...
SUBVU R14, R10 // ...
ADDVU R30, R28 // ...
SGTU R28, R11, R30 // SBCS R15, R11, R11
SUBVU R28, R11 // ...
SGTU R15, R11, R28 // ...
SUBVU R15, R11 // ...
ADDVU R30, R28 // ...
MOVV R8, 0(R7)
MOVV R9, 8(R7)
MOVV R10, 16(R7)
MOVV R11, 24(R7)
ADDVU $32, R5
ADDVU $32, R6
ADDVU $32, R7
SUBVU $1, R4
BNE R4, loop4cont
loop4done:
MOVV R28, c+72(FP)
RET RET
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0 TEXT ·lshVU(SB), NOSPLIT, $0
JMP ·lshVU_g(SB) MOVV z_len+8(FP), R4
BEQ R4, ret0
MOVV s+48(FP), R5
MOVV x_base+24(FP), R6
MOVV z_base+0(FP), R7
// run loop backward
SLLV $3, R4, R8
ADDVU R8, R6
SLLV $3, R4, R8
ADDVU R8, R7
// shift first word into carry
MOVV -8(R6), R8
MOVV $64, R9
SUBVU R5, R9
SRLV R9, R8, R10
SLLV R5, R8
MOVV R10, c+56(FP)
// shift remaining words
SUBVU $1, R4
// compute unrolled loop lengths
AND $3, R4, R10
SRLV $2, R4
loop1:
BEQ R10, loop1done
loop1cont:
// unroll 1X
MOVV -16(R6), R11
SRLV R9, R11, R12
OR R8, R12
SLLV R5, R11, R8
MOVV R12, -8(R7)
ADDVU $-8, R6
ADDVU $-8, R7
SUBVU $1, R10
BNE R10, loop1cont
loop1done:
loop4:
BEQ R4, loop4done
loop4cont:
// unroll 4X
MOVV -16(R6), R10
MOVV -24(R6), R11
MOVV -32(R6), R12
MOVV -40(R6), R13
SRLV R9, R10, R14
OR R8, R14
SLLV R5, R10, R8
SRLV R9, R11, R10
OR R8, R10
SLLV R5, R11, R8
SRLV R9, R12, R11
OR R8, R11
SLLV R5, R12, R8
SRLV R9, R13, R12
OR R8, R12
SLLV R5, R13, R8
MOVV R14, -8(R7)
MOVV R10, -16(R7)
MOVV R11, -24(R7)
MOVV R12, -32(R7)
ADDVU $-32, R6
ADDVU $-32, R7
SUBVU $1, R4
BNE R4, loop4cont
loop4done:
// store final shifted bits
MOVV R8, -8(R7)
RET
ret0:
MOVV R0, c+56(FP)
RET
// func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB), NOSPLIT, $0 TEXT ·rshVU(SB), NOSPLIT, $0
JMP ·rshVU_g(SB) MOVV z_len+8(FP), R4
BEQ R4, ret0
MOVV s+48(FP), R5
MOVV x_base+24(FP), R6
MOVV z_base+0(FP), R7
// shift first word into carry
MOVV 0(R6), R8
MOVV $64, R9
SUBVU R5, R9
SLLV R9, R8, R10
SRLV R5, R8
MOVV R10, c+56(FP)
// shift remaining words
SUBVU $1, R4
// compute unrolled loop lengths
AND $3, R4, R10
SRLV $2, R4
loop1:
BEQ R10, loop1done
loop1cont:
// unroll 1X
MOVV 8(R6), R11
SLLV R9, R11, R12
OR R8, R12
SRLV R5, R11, R8
MOVV R12, 0(R7)
ADDVU $8, R6
ADDVU $8, R7
SUBVU $1, R10
BNE R10, loop1cont
loop1done:
loop4:
BEQ R4, loop4done
loop4cont:
// unroll 4X
MOVV 8(R6), R10
MOVV 16(R6), R11
MOVV 24(R6), R12
MOVV 32(R6), R13
SLLV R9, R10, R14
OR R8, R14
SRLV R5, R10, R8
SLLV R9, R11, R10
OR R8, R10
SRLV R5, R11, R8
SLLV R9, R12, R11
OR R8, R11
SRLV R5, R12, R8
SLLV R9, R13, R12
OR R8, R12
SRLV R5, R13, R8
MOVV R14, 0(R7)
MOVV R10, 8(R7)
MOVV R11, 16(R7)
MOVV R12, 24(R7)
ADDVU $32, R6
ADDVU $32, R7
SUBVU $1, R4
BNE R4, loop4cont
loop4done:
// store final shifted bits
MOVV R8, 0(R7)
RET
ret0:
MOVV R0, c+56(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word) // func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0 TEXT ·mulAddVWW(SB), NOSPLIT, $0
// input: MOVV m+48(FP), R4
// R4: z MOVV a+56(FP), R5
// R5: z_len MOVV z_len+8(FP), R6
// R7: x MOVV x_base+24(FP), R7
// R10: m MOVV z_base+0(FP), R8
// R11: a // compute unrolled loop lengths
MOVV z+0(FP), R4 AND $3, R6, R9
MOVV z_len+8(FP), R5 SRLV $2, R6
MOVV x+24(FP), R7 loop1:
MOVV m+48(FP), R10 BEQ R9, loop1done
MOVV a+56(FP), R11 loop1cont:
SLLV $3, R5 // unroll 1X
MOVV $0, R6 MOVV 0(R7), R10
loop: // synthetic carry, one column at a time
BEQ R5, R6, done MULV R4, R10, R11
MOVV (R6)(R7), R8 MULHVU R4, R10, R12
MULV R8, R10, R9 ADDVU R5, R11, R10 // ADDS R5, R11, R10 (cr=R28)
MULHVU R8, R10, R12 SGTU R5, R10, R28 // ...
ADDV R9, R11, R8 ADDVU R28, R12, R5 // ADC $0, R12, R5
SGTU R9, R8, R11 // if (c' = lo + c) < lo then overflow MOVV R10, 0(R8)
MOVV R8, (R6)(R4) ADDVU $8, R7
ADDV R12, R11 ADDVU $8, R8
ADDV $8, R6 SUBVU $1, R9
JMP loop BNE R9, loop1cont
done: loop1done:
MOVV R11, c+64(FP) loop4:
BEQ R6, loop4done
loop4cont:
// unroll 4X
MOVV 0(R7), R9
MOVV 8(R7), R10
MOVV 16(R7), R11
MOVV 24(R7), R12
// synthetic carry, one column at a time
MULV R4, R9, R13
MULHVU R4, R9, R14
ADDVU R5, R13, R9 // ADDS R5, R13, R9 (cr=R28)
SGTU R5, R9, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
MULV R4, R10, R13
MULHVU R4, R10, R14
ADDVU R5, R13, R10 // ADDS R5, R13, R10 (cr=R28)
SGTU R5, R10, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
MULV R4, R11, R13
MULHVU R4, R11, R14
ADDVU R5, R13, R11 // ADDS R5, R13, R11 (cr=R28)
SGTU R5, R11, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
MULV R4, R12, R13
MULHVU R4, R12, R14
ADDVU R5, R13, R12 // ADDS R5, R13, R12 (cr=R28)
SGTU R5, R12, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
MOVV R9, 0(R8)
MOVV R10, 8(R8)
MOVV R11, 16(R8)
MOVV R12, 24(R8)
ADDVU $32, R7
ADDVU $32, R8
SUBVU $1, R6
BNE R6, loop4cont
loop4done:
MOVV R5, c+64(FP)
RET RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0 TEXT ·addMulVVWW(SB), NOSPLIT, $0
JMP ·addMulVVWW_g(SB) MOVV m+72(FP), R4
MOVV a+80(FP), R5
MOVV z_len+8(FP), R6
MOVV x_base+24(FP), R7
MOVV y_base+48(FP), R8
MOVV z_base+0(FP), R9
// compute unrolled loop lengths
AND $3, R6, R10
SRLV $2, R6
loop1:
BEQ R10, loop1done
loop1cont:
// unroll 1X
MOVV 0(R7), R11
MOVV 0(R8), R12
// synthetic carry, one column at a time
MULV R4, R12, R13
MULHVU R4, R12, R14
ADDVU R11, R13 // ADDS R11, R13, R13 (cr=R28)
SGTU R11, R13, R28 // ...
ADDVU R28, R14 // ADC $0, R14, R14
ADDVU R5, R13, R12 // ADDS R5, R13, R12 (cr=R28)
SGTU R5, R12, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
MOVV R12, 0(R9)
ADDVU $8, R7
ADDVU $8, R8
ADDVU $8, R9
SUBVU $1, R10
BNE R10, loop1cont
loop1done:
loop4:
BEQ R6, loop4done
loop4cont:
// unroll 4X
MOVV 0(R7), R10
MOVV 8(R7), R11
MOVV 16(R7), R12
MOVV 24(R7), R13
MOVV 0(R8), R14
MOVV 8(R8), R15
MOVV 16(R8), R16
MOVV 24(R8), R17
// synthetic carry, one column at a time
MULV R4, R14, R18
MULHVU R4, R14, R19
ADDVU R10, R18 // ADDS R10, R18, R18 (cr=R28)
SGTU R10, R18, R28 // ...
ADDVU R28, R19 // ADC $0, R19, R19
ADDVU R5, R18, R14 // ADDS R5, R18, R14 (cr=R28)
SGTU R5, R14, R28 // ...
ADDVU R28, R19, R5 // ADC $0, R19, R5
MULV R4, R15, R18
MULHVU R4, R15, R19
ADDVU R11, R18 // ADDS R11, R18, R18 (cr=R28)
SGTU R11, R18, R28 // ...
ADDVU R28, R19 // ADC $0, R19, R19
ADDVU R5, R18, R15 // ADDS R5, R18, R15 (cr=R28)
SGTU R5, R15, R28 // ...
ADDVU R28, R19, R5 // ADC $0, R19, R5
MULV R4, R16, R18
MULHVU R4, R16, R19
ADDVU R12, R18 // ADDS R12, R18, R18 (cr=R28)
SGTU R12, R18, R28 // ...
ADDVU R28, R19 // ADC $0, R19, R19
ADDVU R5, R18, R16 // ADDS R5, R18, R16 (cr=R28)
SGTU R5, R16, R28 // ...
ADDVU R28, R19, R5 // ADC $0, R19, R5
MULV R4, R17, R18
MULHVU R4, R17, R19
ADDVU R13, R18 // ADDS R13, R18, R18 (cr=R28)
SGTU R13, R18, R28 // ...
ADDVU R28, R19 // ADC $0, R19, R19
ADDVU R5, R18, R17 // ADDS R5, R18, R17 (cr=R28)
SGTU R5, R17, R28 // ...
ADDVU R28, R19, R5 // ADC $0, R19, R5
MOVV R14, 0(R9)
MOVV R15, 8(R9)
MOVV R16, 16(R9)
MOVV R17, 24(R9)
ADDVU $32, R7
ADDVU $32, R8
ADDVU $32, R9
SUBVU $1, R6
BNE R6, loop4cont
loop4done:
MOVV R5, c+88(FP)
RET
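
An aside on the SGTU sequences above: loong64 (like mips and riscv64 further down) has no carry flag, so the generator synthesizes each ADCS/SBCS with unsigned compares against a dedicated carry register (R28 in this file). The following is a minimal Go sketch of that pattern; the package and helper names (sketch, adc, sbb) are illustrative only and are not part of this CL.

	package sketch

	// adc mirrors the generated ADCS emulation: the two unsigned
	// compares (the SGTU instructions) recover the carry bit that a
	// hardware flag would otherwise hold.
	func adc(x, y, carryIn uint64) (sum, carryOut uint64) {
		s := x + y
		var c1 uint64
		if s < y { // x+y wrapped
			c1 = 1
		}
		s += carryIn
		var c2 uint64
		if s < carryIn { // adding the carry wrapped
			c2 = 1
		}
		return s, c1 + c2 // at most one of c1, c2 can be set
	}

	// sbb mirrors the SBCS emulation used by subVV: subtract the
	// incoming borrow, then y, recording a wraparound each time.
	func sbb(x, y, borrowIn uint64) (diff, borrowOut uint64) {
		var b1 uint64
		if x < borrowIn { // subtracting the borrow wrapped
			b1 = 1
		}
		x -= borrowIn
		var b2 uint64
		if x < y { // subtracting y wrapped
			b2 = 1
		}
		return x - y, b1 + b2 // likewise at most one borrow
	}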


@@ -1,29 +1,467 @@
// Copyright 2013 The Go Authors. All rights reserved. // Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go && (mips64 || mips64le) //go:build !math_big_pure_go && (mips64 || mips64le)
#include "textflag.h" #include "textflag.h"
// This file provides fast assembly versions for the elementary // func addVV(z, x, y []Word) (c Word)
// arithmetic operations on vectors implemented in arith.go.
TEXT ·addVV(SB), NOSPLIT, $0 TEXT ·addVV(SB), NOSPLIT, $0
JMP ·addVV_g(SB) MOVV z_len+8(FP), R1
MOVV x_base+24(FP), R2
MOVV y_base+48(FP), R3
MOVV z_base+0(FP), R4
// compute unrolled loop lengths
AND $3, R1, R5
SRLV $2, R1
XOR R26, R26 // clear carry
loop1:
BEQ R5, loop1done
loop1cont:
// unroll 1X
MOVV 0(R2), R6
MOVV 0(R3), R7
ADDVU R7, R6 // ADCS R7, R6, R6 (cr=R26)
SGTU R7, R6, R23 // ...
ADDVU R26, R6 // ...
SGTU R26, R6, R26 // ...
ADDVU R23, R26 // ...
MOVV R6, 0(R4)
ADDVU $8, R2
ADDVU $8, R3
ADDVU $8, R4
SUBVU $1, R5
BNE R5, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVV 0(R2), R5
MOVV 8(R2), R6
MOVV 16(R2), R7
MOVV 24(R2), R8
MOVV 0(R3), R9
MOVV 8(R3), R10
MOVV 16(R3), R11
MOVV 24(R3), R12
ADDVU R9, R5 // ADCS R9, R5, R5 (cr=R26)
SGTU R9, R5, R23 // ...
ADDVU R26, R5 // ...
SGTU R26, R5, R26 // ...
ADDVU R23, R26 // ...
ADDVU R10, R6 // ADCS R10, R6, R6 (cr=R26)
SGTU R10, R6, R23 // ...
ADDVU R26, R6 // ...
SGTU R26, R6, R26 // ...
ADDVU R23, R26 // ...
ADDVU R11, R7 // ADCS R11, R7, R7 (cr=R26)
SGTU R11, R7, R23 // ...
ADDVU R26, R7 // ...
SGTU R26, R7, R26 // ...
ADDVU R23, R26 // ...
ADDVU R12, R8 // ADCS R12, R8, R8 (cr=R26)
SGTU R12, R8, R23 // ...
ADDVU R26, R8 // ...
SGTU R26, R8, R26 // ...
ADDVU R23, R26 // ...
MOVV R5, 0(R4)
MOVV R6, 8(R4)
MOVV R7, 16(R4)
MOVV R8, 24(R4)
ADDVU $32, R2
ADDVU $32, R3
ADDVU $32, R4
SUBVU $1, R1
BNE R1, loop4cont
loop4done:
MOVV R26, c+72(FP)
RET
// func subVV(z, x, y []Word) (c Word)
TEXT ·subVV(SB), NOSPLIT, $0 TEXT ·subVV(SB), NOSPLIT, $0
JMP ·subVV_g(SB) MOVV z_len+8(FP), R1
MOVV x_base+24(FP), R2
MOVV y_base+48(FP), R3
MOVV z_base+0(FP), R4
// compute unrolled loop lengths
AND $3, R1, R5
SRLV $2, R1
XOR R26, R26 // clear carry
loop1:
BEQ R5, loop1done
loop1cont:
// unroll 1X
MOVV 0(R2), R6
MOVV 0(R3), R7
SGTU R26, R6, R23 // SBCS R7, R6, R6
SUBVU R26, R6 // ...
SGTU R7, R6, R26 // ...
SUBVU R7, R6 // ...
ADDVU R23, R26 // ...
MOVV R6, 0(R4)
ADDVU $8, R2
ADDVU $8, R3
ADDVU $8, R4
SUBVU $1, R5
BNE R5, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVV 0(R2), R5
MOVV 8(R2), R6
MOVV 16(R2), R7
MOVV 24(R2), R8
MOVV 0(R3), R9
MOVV 8(R3), R10
MOVV 16(R3), R11
MOVV 24(R3), R12
SGTU R26, R5, R23 // SBCS R9, R5, R5
SUBVU R26, R5 // ...
SGTU R9, R5, R26 // ...
SUBVU R9, R5 // ...
ADDVU R23, R26 // ...
SGTU R26, R6, R23 // SBCS R10, R6, R6
SUBVU R26, R6 // ...
SGTU R10, R6, R26 // ...
SUBVU R10, R6 // ...
ADDVU R23, R26 // ...
SGTU R26, R7, R23 // SBCS R11, R7, R7
SUBVU R26, R7 // ...
SGTU R11, R7, R26 // ...
SUBVU R11, R7 // ...
ADDVU R23, R26 // ...
SGTU R26, R8, R23 // SBCS R12, R8, R8
SUBVU R26, R8 // ...
SGTU R12, R8, R26 // ...
SUBVU R12, R8 // ...
ADDVU R23, R26 // ...
MOVV R5, 0(R4)
MOVV R6, 8(R4)
MOVV R7, 16(R4)
MOVV R8, 24(R4)
ADDVU $32, R2
ADDVU $32, R3
ADDVU $32, R4
SUBVU $1, R1
BNE R1, loop4cont
loop4done:
MOVV R26, c+72(FP)
RET
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0 TEXT ·lshVU(SB), NOSPLIT, $0
JMP ·lshVU_g(SB) MOVV z_len+8(FP), R1
BEQ R1, ret0
MOVV s+48(FP), R2
MOVV x_base+24(FP), R3
MOVV z_base+0(FP), R4
// run loop backward
SLLV $3, R1, R5
ADDVU R5, R3
SLLV $3, R1, R5
ADDVU R5, R4
// shift first word into carry
MOVV -8(R3), R5
MOVV $64, R6
SUBVU R2, R6
SRLV R6, R5, R7
SLLV R2, R5
MOVV R7, c+56(FP)
// shift remaining words
SUBVU $1, R1
// compute unrolled loop lengths
AND $3, R1, R7
SRLV $2, R1
loop1:
BEQ R7, loop1done
loop1cont:
// unroll 1X
MOVV -16(R3), R8
SRLV R6, R8, R9
OR R5, R9
SLLV R2, R8, R5
MOVV R9, -8(R4)
ADDVU $-8, R3
ADDVU $-8, R4
SUBVU $1, R7
BNE R7, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVV -16(R3), R7
MOVV -24(R3), R8
MOVV -32(R3), R9
MOVV -40(R3), R10
SRLV R6, R7, R11
OR R5, R11
SLLV R2, R7, R5
SRLV R6, R8, R7
OR R5, R7
SLLV R2, R8, R5
SRLV R6, R9, R8
OR R5, R8
SLLV R2, R9, R5
SRLV R6, R10, R9
OR R5, R9
SLLV R2, R10, R5
MOVV R11, -8(R4)
MOVV R7, -16(R4)
MOVV R8, -24(R4)
MOVV R9, -32(R4)
ADDVU $-32, R3
ADDVU $-32, R4
SUBVU $1, R1
BNE R1, loop4cont
loop4done:
// store final shifted bits
MOVV R5, -8(R4)
RET
ret0:
MOVV R0, c+56(FP)
RET
// func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB), NOSPLIT, $0 TEXT ·rshVU(SB), NOSPLIT, $0
JMP ·rshVU_g(SB) MOVV z_len+8(FP), R1
BEQ R1, ret0
MOVV s+48(FP), R2
MOVV x_base+24(FP), R3
MOVV z_base+0(FP), R4
// shift first word into carry
MOVV 0(R3), R5
MOVV $64, R6
SUBVU R2, R6
SLLV R6, R5, R7
SRLV R2, R5
MOVV R7, c+56(FP)
// shift remaining words
SUBVU $1, R1
// compute unrolled loop lengths
AND $3, R1, R7
SRLV $2, R1
loop1:
BEQ R7, loop1done
loop1cont:
// unroll 1X
MOVV 8(R3), R8
SLLV R6, R8, R9
OR R5, R9
SRLV R2, R8, R5
MOVV R9, 0(R4)
ADDVU $8, R3
ADDVU $8, R4
SUBVU $1, R7
BNE R7, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVV 8(R3), R7
MOVV 16(R3), R8
MOVV 24(R3), R9
MOVV 32(R3), R10
SLLV R6, R7, R11
OR R5, R11
SRLV R2, R7, R5
SLLV R6, R8, R7
OR R5, R7
SRLV R2, R8, R5
SLLV R6, R9, R8
OR R5, R8
SRLV R2, R9, R5
SLLV R6, R10, R9
OR R5, R9
SRLV R2, R10, R5
MOVV R11, 0(R4)
MOVV R7, 8(R4)
MOVV R8, 16(R4)
MOVV R9, 24(R4)
ADDVU $32, R3
ADDVU $32, R4
SUBVU $1, R1
BNE R1, loop4cont
loop4done:
// store final shifted bits
MOVV R5, 0(R4)
RET
ret0:
MOVV R0, c+56(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0 TEXT ·mulAddVWW(SB), NOSPLIT, $0
JMP ·mulAddVWW_g(SB) MOVV m+48(FP), R1
MOVV a+56(FP), R2
MOVV z_len+8(FP), R3
MOVV x_base+24(FP), R4
MOVV z_base+0(FP), R5
// compute unrolled loop lengths
AND $3, R3, R6
SRLV $2, R3
loop1:
BEQ R6, loop1done
loop1cont:
// unroll 1X
MOVV 0(R4), R7
// synthetic carry, one column at a time
MULVU R1, R7
MOVV LO, R8
MOVV HI, R9
ADDVU R2, R8, R7 // ADDS R2, R8, R7 (cr=R26)
SGTU R2, R7, R26 // ...
ADDVU R26, R9, R2 // ADC $0, R9, R2
MOVV R7, 0(R5)
ADDVU $8, R4
ADDVU $8, R5
SUBVU $1, R6
BNE R6, loop1cont
loop1done:
loop4:
BEQ R3, loop4done
loop4cont:
// unroll 4X
MOVV 0(R4), R6
MOVV 8(R4), R7
MOVV 16(R4), R8
MOVV 24(R4), R9
// synthetic carry, one column at a time
MULVU R1, R6
MOVV LO, R10
MOVV HI, R11
ADDVU R2, R10, R6 // ADDS R2, R10, R6 (cr=R26)
SGTU R2, R6, R26 // ...
ADDVU R26, R11, R2 // ADC $0, R11, R2
MULVU R1, R7
MOVV LO, R10
MOVV HI, R11
ADDVU R2, R10, R7 // ADDS R2, R10, R7 (cr=R26)
SGTU R2, R7, R26 // ...
ADDVU R26, R11, R2 // ADC $0, R11, R2
MULVU R1, R8
MOVV LO, R10
MOVV HI, R11
ADDVU R2, R10, R8 // ADDS R2, R10, R8 (cr=R26)
SGTU R2, R8, R26 // ...
ADDVU R26, R11, R2 // ADC $0, R11, R2
MULVU R1, R9
MOVV LO, R10
MOVV HI, R11
ADDVU R2, R10, R9 // ADDS R2, R10, R9 (cr=R26)
SGTU R2, R9, R26 // ...
ADDVU R26, R11, R2 // ADC $0, R11, R2
MOVV R6, 0(R5)
MOVV R7, 8(R5)
MOVV R8, 16(R5)
MOVV R9, 24(R5)
ADDVU $32, R4
ADDVU $32, R5
SUBVU $1, R3
BNE R3, loop4cont
loop4done:
MOVV R2, c+64(FP)
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0 TEXT ·addMulVVWW(SB), NOSPLIT, $0
JMP ·addMulVVWW_g(SB) MOVV m+72(FP), R1
MOVV a+80(FP), R2
MOVV z_len+8(FP), R3
MOVV x_base+24(FP), R4
MOVV y_base+48(FP), R5
MOVV z_base+0(FP), R6
// compute unrolled loop lengths
AND $3, R3, R7
SRLV $2, R3
loop1:
BEQ R7, loop1done
loop1cont:
// unroll 1X
MOVV 0(R4), R8
MOVV 0(R5), R9
// synthetic carry, one column at a time
MULVU R1, R9
MOVV LO, R10
MOVV HI, R11
ADDVU R8, R10 // ADDS R8, R10, R10 (cr=R26)
SGTU R8, R10, R26 // ...
ADDVU R26, R11 // ADC $0, R11, R11
ADDVU R2, R10, R9 // ADDS R2, R10, R9 (cr=R26)
SGTU R2, R9, R26 // ...
ADDVU R26, R11, R2 // ADC $0, R11, R2
MOVV R9, 0(R6)
ADDVU $8, R4
ADDVU $8, R5
ADDVU $8, R6
SUBVU $1, R7
BNE R7, loop1cont
loop1done:
loop4:
BEQ R3, loop4done
loop4cont:
// unroll 4X
MOVV 0(R4), R7
MOVV 8(R4), R8
MOVV 16(R4), R9
MOVV 24(R4), R10
MOVV 0(R5), R11
MOVV 8(R5), R12
MOVV 16(R5), R13
MOVV 24(R5), R14
// synthetic carry, one column at a time
MULVU R1, R11
MOVV LO, R15
MOVV HI, R16
ADDVU R7, R15 // ADDS R7, R15, R15 (cr=R26)
SGTU R7, R15, R26 // ...
ADDVU R26, R16 // ADC $0, R16, R16
ADDVU R2, R15, R11 // ADDS R2, R15, R11 (cr=R26)
SGTU R2, R11, R26 // ...
ADDVU R26, R16, R2 // ADC $0, R16, R2
MULVU R1, R12
MOVV LO, R15
MOVV HI, R16
ADDVU R8, R15 // ADDS R8, R15, R15 (cr=R26)
SGTU R8, R15, R26 // ...
ADDVU R26, R16 // ADC $0, R16, R16
ADDVU R2, R15, R12 // ADDS R2, R15, R12 (cr=R26)
SGTU R2, R12, R26 // ...
ADDVU R26, R16, R2 // ADC $0, R16, R2
MULVU R1, R13
MOVV LO, R15
MOVV HI, R16
ADDVU R9, R15 // ADDS R9, R15, R15 (cr=R26)
SGTU R9, R15, R26 // ...
ADDVU R26, R16 // ADC $0, R16, R16
ADDVU R2, R15, R13 // ADDS R2, R15, R13 (cr=R26)
SGTU R2, R13, R26 // ...
ADDVU R26, R16, R2 // ADC $0, R16, R2
MULVU R1, R14
MOVV LO, R15
MOVV HI, R16
ADDVU R10, R15 // ADDS R10, R15, R15 (cr=R26)
SGTU R10, R15, R26 // ...
ADDVU R26, R16 // ADC $0, R16, R16
ADDVU R2, R15, R14 // ADDS R2, R15, R14 (cr=R26)
SGTU R2, R14, R26 // ...
ADDVU R26, R16, R2 // ADC $0, R16, R2
MOVV R11, 0(R6)
MOVV R12, 8(R6)
MOVV R13, 16(R6)
MOVV R14, 24(R6)
ADDVU $32, R4
ADDVU $32, R5
ADDVU $32, R6
SUBVU $1, R3
BNE R3, loop4cont
loop4done:
MOVV R2, c+88(FP)
RET
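
The "synthetic carry, one column at a time" comments in mulAddVWW and addMulVVWW describe the core per-word step sketched below in portable Go (addMulVVWW folds in one more addition the same way). mulAddStep is an illustrative name only; bits.Mul64 stands in for the MULVU/MULHVU (or MULV/MULHVU, MUL/MULHU) pairs in the generated code.

	package sketch

	import "math/bits"

	// mulAddStep: z = lo(x*m) + c, and the next carry is hi(x*m)
	// plus the overflow of that addition, which the SGTU/SLTU
	// compare detects. hi+1 cannot itself overflow, so a single
	// add suffices (the "ADC $0" in the comments).
	func mulAddStep(x, m, c uint64) (z, cOut uint64) {
		hi, lo := bits.Mul64(x, m)
		z = lo + c
		var overflow uint64
		if z < c {
			overflow = 1
		}
		return z, hi + overflow
	}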


@@ -1,29 +1,467 @@
// Copyright 2016 The Go Authors. All rights reserved. // Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go && (mips || mipsle) //go:build !math_big_pure_go && (mips || mipsle)
#include "textflag.h" #include "textflag.h"
// This file provides fast assembly versions for the elementary // func addVV(z, x, y []Word) (c Word)
// arithmetic operations on vectors implemented in arith.go.
TEXT ·addVV(SB), NOSPLIT, $0 TEXT ·addVV(SB), NOSPLIT, $0
JMP ·addVV_g(SB) MOVW z_len+4(FP), R1
MOVW x_base+12(FP), R2
MOVW y_base+24(FP), R3
MOVW z_base+0(FP), R4
// compute unrolled loop lengths
AND $3, R1, R5
SRL $2, R1
XOR R26, R26 // clear carry
loop1:
BEQ R5, loop1done
loop1cont:
// unroll 1X
MOVW 0(R2), R6
MOVW 0(R3), R7
ADDU R7, R6 // ADCS R7, R6, R6 (cr=R26)
SGTU R7, R6, R23 // ...
ADDU R26, R6 // ...
SGTU R26, R6, R26 // ...
ADDU R23, R26 // ...
MOVW R6, 0(R4)
ADDU $4, R2
ADDU $4, R3
ADDU $4, R4
SUBU $1, R5
BNE R5, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVW 0(R2), R5
MOVW 4(R2), R6
MOVW 8(R2), R7
MOVW 12(R2), R8
MOVW 0(R3), R9
MOVW 4(R3), R10
MOVW 8(R3), R11
MOVW 12(R3), R12
ADDU R9, R5 // ADCS R9, R5, R5 (cr=R26)
SGTU R9, R5, R23 // ...
ADDU R26, R5 // ...
SGTU R26, R5, R26 // ...
ADDU R23, R26 // ...
ADDU R10, R6 // ADCS R10, R6, R6 (cr=R26)
SGTU R10, R6, R23 // ...
ADDU R26, R6 // ...
SGTU R26, R6, R26 // ...
ADDU R23, R26 // ...
ADDU R11, R7 // ADCS R11, R7, R7 (cr=R26)
SGTU R11, R7, R23 // ...
ADDU R26, R7 // ...
SGTU R26, R7, R26 // ...
ADDU R23, R26 // ...
ADDU R12, R8 // ADCS R12, R8, R8 (cr=R26)
SGTU R12, R8, R23 // ...
ADDU R26, R8 // ...
SGTU R26, R8, R26 // ...
ADDU R23, R26 // ...
MOVW R5, 0(R4)
MOVW R6, 4(R4)
MOVW R7, 8(R4)
MOVW R8, 12(R4)
ADDU $16, R2
ADDU $16, R3
ADDU $16, R4
SUBU $1, R1
BNE R1, loop4cont
loop4done:
MOVW R26, c+36(FP)
RET
// func subVV(z, x, y []Word) (c Word)
TEXT ·subVV(SB), NOSPLIT, $0 TEXT ·subVV(SB), NOSPLIT, $0
JMP ·subVV_g(SB) MOVW z_len+4(FP), R1
MOVW x_base+12(FP), R2
MOVW y_base+24(FP), R3
MOVW z_base+0(FP), R4
// compute unrolled loop lengths
AND $3, R1, R5
SRL $2, R1
XOR R26, R26 // clear carry
loop1:
BEQ R5, loop1done
loop1cont:
// unroll 1X
MOVW 0(R2), R6
MOVW 0(R3), R7
SGTU R26, R6, R23 // SBCS R7, R6, R6
SUBU R26, R6 // ...
SGTU R7, R6, R26 // ...
SUBU R7, R6 // ...
ADDU R23, R26 // ...
MOVW R6, 0(R4)
ADDU $4, R2
ADDU $4, R3
ADDU $4, R4
SUBU $1, R5
BNE R5, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVW 0(R2), R5
MOVW 4(R2), R6
MOVW 8(R2), R7
MOVW 12(R2), R8
MOVW 0(R3), R9
MOVW 4(R3), R10
MOVW 8(R3), R11
MOVW 12(R3), R12
SGTU R26, R5, R23 // SBCS R9, R5, R5
SUBU R26, R5 // ...
SGTU R9, R5, R26 // ...
SUBU R9, R5 // ...
ADDU R23, R26 // ...
SGTU R26, R6, R23 // SBCS R10, R6, R6
SUBU R26, R6 // ...
SGTU R10, R6, R26 // ...
SUBU R10, R6 // ...
ADDU R23, R26 // ...
SGTU R26, R7, R23 // SBCS R11, R7, R7
SUBU R26, R7 // ...
SGTU R11, R7, R26 // ...
SUBU R11, R7 // ...
ADDU R23, R26 // ...
SGTU R26, R8, R23 // SBCS R12, R8, R8
SUBU R26, R8 // ...
SGTU R12, R8, R26 // ...
SUBU R12, R8 // ...
ADDU R23, R26 // ...
MOVW R5, 0(R4)
MOVW R6, 4(R4)
MOVW R7, 8(R4)
MOVW R8, 12(R4)
ADDU $16, R2
ADDU $16, R3
ADDU $16, R4
SUBU $1, R1
BNE R1, loop4cont
loop4done:
MOVW R26, c+36(FP)
RET
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0 TEXT ·lshVU(SB), NOSPLIT, $0
JMP ·lshVU_g(SB) MOVW z_len+4(FP), R1
BEQ R1, ret0
MOVW s+24(FP), R2
MOVW x_base+12(FP), R3
MOVW z_base+0(FP), R4
// run loop backward
SLL $2, R1, R5
ADDU R5, R3
SLL $2, R1, R5
ADDU R5, R4
// shift first word into carry
MOVW -4(R3), R5
MOVW $32, R6
SUBU R2, R6
SRL R6, R5, R7
SLL R2, R5
MOVW R7, c+28(FP)
// shift remaining words
SUBU $1, R1
// compute unrolled loop lengths
AND $3, R1, R7
SRL $2, R1
loop1:
BEQ R7, loop1done
loop1cont:
// unroll 1X
MOVW -8(R3), R8
SRL R6, R8, R9
OR R5, R9
SLL R2, R8, R5
MOVW R9, -4(R4)
ADDU $-4, R3
ADDU $-4, R4
SUBU $1, R7
BNE R7, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVW -8(R3), R7
MOVW -12(R3), R8
MOVW -16(R3), R9
MOVW -20(R3), R10
SRL R6, R7, R11
OR R5, R11
SLL R2, R7, R5
SRL R6, R8, R7
OR R5, R7
SLL R2, R8, R5
SRL R6, R9, R8
OR R5, R8
SLL R2, R9, R5
SRL R6, R10, R9
OR R5, R9
SLL R2, R10, R5
MOVW R11, -4(R4)
MOVW R7, -8(R4)
MOVW R8, -12(R4)
MOVW R9, -16(R4)
ADDU $-16, R3
ADDU $-16, R4
SUBU $1, R1
BNE R1, loop4cont
loop4done:
// store final shifted bits
MOVW R5, -4(R4)
RET
ret0:
MOVW R0, c+28(FP)
RET
// func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB), NOSPLIT, $0 TEXT ·rshVU(SB), NOSPLIT, $0
JMP ·rshVU_g(SB) MOVW z_len+4(FP), R1
BEQ R1, ret0
MOVW s+24(FP), R2
MOVW x_base+12(FP), R3
MOVW z_base+0(FP), R4
// shift first word into carry
MOVW 0(R3), R5
MOVW $32, R6
SUBU R2, R6
SLL R6, R5, R7
SRL R2, R5
MOVW R7, c+28(FP)
// shift remaining words
SUBU $1, R1
// compute unrolled loop lengths
AND $3, R1, R7
SRL $2, R1
loop1:
BEQ R7, loop1done
loop1cont:
// unroll 1X
MOVW 4(R3), R8
SLL R6, R8, R9
OR R5, R9
SRL R2, R8, R5
MOVW R9, 0(R4)
ADDU $4, R3
ADDU $4, R4
SUBU $1, R7
BNE R7, loop1cont
loop1done:
loop4:
BEQ R1, loop4done
loop4cont:
// unroll 4X
MOVW 4(R3), R7
MOVW 8(R3), R8
MOVW 12(R3), R9
MOVW 16(R3), R10
SLL R6, R7, R11
OR R5, R11
SRL R2, R7, R5
SLL R6, R8, R7
OR R5, R7
SRL R2, R8, R5
SLL R6, R9, R8
OR R5, R8
SRL R2, R9, R5
SLL R6, R10, R9
OR R5, R9
SRL R2, R10, R5
MOVW R11, 0(R4)
MOVW R7, 4(R4)
MOVW R8, 8(R4)
MOVW R9, 12(R4)
ADDU $16, R3
ADDU $16, R4
SUBU $1, R1
BNE R1, loop4cont
loop4done:
// store final shifted bits
MOVW R5, 0(R4)
RET
ret0:
MOVW R0, c+28(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0 TEXT ·mulAddVWW(SB), NOSPLIT, $0
JMP ·mulAddVWW_g(SB) MOVW m+24(FP), R1
MOVW a+28(FP), R2
MOVW z_len+4(FP), R3
MOVW x_base+12(FP), R4
MOVW z_base+0(FP), R5
// compute unrolled loop lengths
AND $3, R3, R6
SRL $2, R3
loop1:
BEQ R6, loop1done
loop1cont:
// unroll 1X
MOVW 0(R4), R7
// synthetic carry, one column at a time
MULU R1, R7
MOVW LO, R8
MOVW HI, R9
ADDU R2, R8, R7 // ADDS R2, R8, R7 (cr=R26)
SGTU R2, R7, R26 // ...
ADDU R26, R9, R2 // ADC $0, R9, R2
MOVW R7, 0(R5)
ADDU $4, R4
ADDU $4, R5
SUBU $1, R6
BNE R6, loop1cont
loop1done:
loop4:
BEQ R3, loop4done
loop4cont:
// unroll 4X
MOVW 0(R4), R6
MOVW 4(R4), R7
MOVW 8(R4), R8
MOVW 12(R4), R9
// synthetic carry, one column at a time
MULU R1, R6
MOVW LO, R10
MOVW HI, R11
ADDU R2, R10, R6 // ADDS R2, R10, R6 (cr=R26)
SGTU R2, R6, R26 // ...
ADDU R26, R11, R2 // ADC $0, R11, R2
MULU R1, R7
MOVW LO, R10
MOVW HI, R11
ADDU R2, R10, R7 // ADDS R2, R10, R7 (cr=R26)
SGTU R2, R7, R26 // ...
ADDU R26, R11, R2 // ADC $0, R11, R2
MULU R1, R8
MOVW LO, R10
MOVW HI, R11
ADDU R2, R10, R8 // ADDS R2, R10, R8 (cr=R26)
SGTU R2, R8, R26 // ...
ADDU R26, R11, R2 // ADC $0, R11, R2
MULU R1, R9
MOVW LO, R10
MOVW HI, R11
ADDU R2, R10, R9 // ADDS R2, R10, R9 (cr=R26)
SGTU R2, R9, R26 // ...
ADDU R26, R11, R2 // ADC $0, R11, R2
MOVW R6, 0(R5)
MOVW R7, 4(R5)
MOVW R8, 8(R5)
MOVW R9, 12(R5)
ADDU $16, R4
ADDU $16, R5
SUBU $1, R3
BNE R3, loop4cont
loop4done:
MOVW R2, c+32(FP)
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0 TEXT ·addMulVVWW(SB), NOSPLIT, $0
JMP ·addMulVVWW_g(SB) MOVW m+36(FP), R1
MOVW a+40(FP), R2
MOVW z_len+4(FP), R3
MOVW x_base+12(FP), R4
MOVW y_base+24(FP), R5
MOVW z_base+0(FP), R6
// compute unrolled loop lengths
AND $3, R3, R7
SRL $2, R3
loop1:
BEQ R7, loop1done
loop1cont:
// unroll 1X
MOVW 0(R4), R8
MOVW 0(R5), R9
// synthetic carry, one column at a time
MULU R1, R9
MOVW LO, R10
MOVW HI, R11
ADDU R8, R10 // ADDS R8, R10, R10 (cr=R26)
SGTU R8, R10, R26 // ...
ADDU R26, R11 // ADC $0, R11, R11
ADDU R2, R10, R9 // ADDS R2, R10, R9 (cr=R26)
SGTU R2, R9, R26 // ...
ADDU R26, R11, R2 // ADC $0, R11, R2
MOVW R9, 0(R6)
ADDU $4, R4
ADDU $4, R5
ADDU $4, R6
SUBU $1, R7
BNE R7, loop1cont
loop1done:
loop4:
BEQ R3, loop4done
loop4cont:
// unroll 4X
MOVW 0(R4), R7
MOVW 4(R4), R8
MOVW 8(R4), R9
MOVW 12(R4), R10
MOVW 0(R5), R11
MOVW 4(R5), R12
MOVW 8(R5), R13
MOVW 12(R5), R14
// synthetic carry, one column at a time
MULU R1, R11
MOVW LO, R15
MOVW HI, R16
ADDU R7, R15 // ADDS R7, R15, R15 (cr=R26)
SGTU R7, R15, R26 // ...
ADDU R26, R16 // ADC $0, R16, R16
ADDU R2, R15, R11 // ADDS R2, R15, R11 (cr=R26)
SGTU R2, R11, R26 // ...
ADDU R26, R16, R2 // ADC $0, R16, R2
MULU R1, R12
MOVW LO, R15
MOVW HI, R16
ADDU R8, R15 // ADDS R8, R15, R15 (cr=R26)
SGTU R8, R15, R26 // ...
ADDU R26, R16 // ADC $0, R16, R16
ADDU R2, R15, R12 // ADDS R2, R15, R12 (cr=R26)
SGTU R2, R12, R26 // ...
ADDU R26, R16, R2 // ADC $0, R16, R2
MULU R1, R13
MOVW LO, R15
MOVW HI, R16
ADDU R9, R15 // ADDS R9, R15, R15 (cr=R26)
SGTU R9, R15, R26 // ...
ADDU R26, R16 // ADC $0, R16, R16
ADDU R2, R15, R13 // ADDS R2, R15, R13 (cr=R26)
SGTU R2, R13, R26 // ...
ADDU R26, R16, R2 // ADC $0, R16, R2
MULU R1, R14
MOVW LO, R15
MOVW HI, R16
ADDU R10, R15 // ADDS R10, R15, R15 (cr=R26)
SGTU R10, R15, R26 // ...
ADDU R26, R16 // ADC $0, R16, R16
ADDU R2, R15, R14 // ADDS R2, R15, R14 (cr=R26)
SGTU R2, R14, R26 // ...
ADDU R26, R16, R2 // ADC $0, R16, R2
MOVW R11, 0(R6)
MOVW R12, 4(R6)
MOVW R13, 8(R6)
MOVW R14, 12(R6)
ADDU $16, R4
ADDU $16, R5
ADDU $16, R6
SUBU $1, R3
BNE R3, loop4cont
loop4done:
MOVW R2, c+44(FP)
RET


@@ -1,469 +1,386 @@
// Copyright 2013 The Go Authors. All rights reserved. // Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go && (ppc64 || ppc64le) //go:build !math_big_pure_go && (ppc64 || ppc64le)
#include "textflag.h" #include "textflag.h"
// This file provides fast assembly versions for the elementary // func addVV(z, x, y []Word) (c Word)
// arithmetic operations on vectors implemented in arith.go.
// func addVV(z, y, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying
TEXT ·addVV(SB), NOSPLIT, $0 TEXT ·addVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R7 // R7 = z_len MOVD z_len+8(FP), R3
MOVD x+24(FP), R8 // R8 = x[] MOVD x_base+24(FP), R4
MOVD y+48(FP), R9 // R9 = y[] MOVD y_base+48(FP), R5
MOVD z+0(FP), R10 // R10 = z[] MOVD z_base+0(FP), R6
// compute unrolled loop lengths
// If z_len = 0, we are done ANDCC $3, R3, R7
CMP R7, $0 SRD $2, R3
MOVD R0, R4 ADDC R0, R3 // clear carry
BEQ done loop1:
CMP R7, $0; BEQ loop1done; MOVD R7, CTR
// Process the first iteration out of the loop so we can loop1cont:
// use MOVDU and avoid 3 index registers updates. // unroll 1X
MOVD 0(R8), R11 // R11 = x[i] MOVD 0(R4), R8
MOVD 0(R9), R12 // R12 = y[i] MOVD 0(R5), R9
ADD $-1, R7 // R7 = z_len - 1 ADDE R9, R8
ADDC R12, R11, R15 // R15 = x[i] + y[i], set CA MOVD R8, 0(R6)
CMP R7, $0 ADD $8, R4
MOVD R15, 0(R10) // z[i] ADD $8, R5
BEQ final // If z_len was 1, we are done ADD $8, R6
BDNZ loop1cont
SRD $2, R7, R5 // R5 = z_len/4 loop1done:
CMP R5, $0 loop4:
MOVD R5, CTR // Set up loop counter CMP R3, $0; BEQ loop4done; MOVD R3, CTR
BEQ tail // If R5 = 0, we can't use the loop loop4cont:
// unroll 4X
// Process 4 elements per iteration. Unrolling this loop MOVD 0(R4), R7
// means a performance trade-off: we will lose performance MOVD 8(R4), R8
// for small values of z_len (0.90x in the worst case), but MOVD 16(R4), R9
// gain significant performance as z_len increases (up to MOVD 24(R4), R10
// 1.45x). MOVD 0(R5), R11
MOVD 8(R5), R12
PCALIGN $16 MOVD 16(R5), R14
loop: MOVD 24(R5), R15
MOVD 8(R8), R11 // R11 = x[i] ADDE R11, R7
MOVD 16(R8), R12 // R12 = x[i+1] ADDE R12, R8
MOVD 24(R8), R14 // R14 = x[i+2] ADDE R14, R9
MOVDU 32(R8), R15 // R15 = x[i+3] ADDE R15, R10
MOVD 8(R9), R16 // R16 = y[i] MOVD R7, 0(R6)
MOVD 16(R9), R17 // R17 = y[i+1] MOVD R8, 8(R6)
MOVD 24(R9), R18 // R18 = y[i+2] MOVD R9, 16(R6)
MOVDU 32(R9), R19 // R19 = y[i+3] MOVD R10, 24(R6)
ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA ADD $32, R4
ADDE R12, R17, R21 // R21 = x[i+1] + y[i+1] + CA ADD $32, R5
ADDE R14, R18, R22 // R22 = x[i+2] + y[i+2] + CA ADD $32, R6
ADDE R15, R19, R23 // R23 = x[i+3] + y[i+3] + CA BDNZ loop4cont
MOVD R20, 8(R10) // z[i] loop4done:
MOVD R21, 16(R10) // z[i+1] ADDE R0, R0, R4 // save & convert add carry
MOVD R22, 24(R10) // z[i+2]
MOVDU R23, 32(R10) // z[i+3]
ADD $-4, R7 // R7 = z_len - 4
BDNZ loop
// We may have more elements to read
CMP R7, $0
BEQ final
// Process the remaining elements, one at a time
tail:
MOVDU 8(R8), R11 // R11 = x[i]
MOVDU 8(R9), R16 // R16 = y[i]
ADD $-1, R7 // R7 = z_len - 1
ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
CMP R7, $0
MOVDU R20, 8(R10) // z[i]
BEQ final // If R7 = 0, we are done
MOVDU 8(R8), R11
MOVDU 8(R9), R16
ADD $-1, R7
ADDE R11, R16, R20
CMP R7, $0
MOVDU R20, 8(R10)
BEQ final
MOVD 8(R8), R11
MOVD 8(R9), R16
ADDE R11, R16, R20
MOVD R20, 8(R10)
final:
ADDZE R4 // Capture CA
done:
MOVD R4, c+72(FP) MOVD R4, c+72(FP)
RET RET
// func subVV(z, x, y []Word) (c Word) // func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, carrying
TEXT ·subVV(SB), NOSPLIT, $0 TEXT ·subVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R7 // R7 = z_len MOVD z_len+8(FP), R3
MOVD x+24(FP), R8 // R8 = x[] MOVD x_base+24(FP), R4
MOVD y+48(FP), R9 // R9 = y[] MOVD y_base+48(FP), R5
MOVD z+0(FP), R10 // R10 = z[] MOVD z_base+0(FP), R6
// compute unrolled loop lengths
// If z_len = 0, we are done ANDCC $3, R3, R7
CMP R7, $0 SRD $2, R3
MOVD R0, R4 SUBC R0, R3 // clear carry
BEQ done loop1:
CMP R7, $0; BEQ loop1done; MOVD R7, CTR
// Process the first iteration out of the loop so we can loop1cont:
// use MOVDU and avoid 3 index registers updates. // unroll 1X
MOVD 0(R8), R11 // R11 = x[i] MOVD 0(R4), R8
MOVD 0(R9), R12 // R12 = y[i] MOVD 0(R5), R9
ADD $-1, R7 // R7 = z_len - 1 SUBE R9, R8
SUBC R12, R11, R15 // R15 = x[i] - y[i], set CA MOVD R8, 0(R6)
CMP R7, $0 ADD $8, R4
MOVD R15, 0(R10) // z[i] ADD $8, R5
BEQ final // If z_len was 1, we are done ADD $8, R6
BDNZ loop1cont
SRD $2, R7, R5 // R5 = z_len/4 loop1done:
CMP R5, $0 loop4:
MOVD R5, CTR // Set up loop counter CMP R3, $0; BEQ loop4done; MOVD R3, CTR
BEQ tail // If R5 = 0, we can't use the loop loop4cont:
// unroll 4X
// Process 4 elements per iteration. Unrolling this loop MOVD 0(R4), R7
// means a performance trade-off: we will lose performance MOVD 8(R4), R8
// for small values of z_len (0.92x in the worst case), but MOVD 16(R4), R9
// gain significant performance as z_len increases (up to MOVD 24(R4), R10
// 1.45x). MOVD 0(R5), R11
MOVD 8(R5), R12
PCALIGN $16 MOVD 16(R5), R14
loop: MOVD 24(R5), R15
MOVD 8(R8), R11 // R11 = x[i] SUBE R11, R7
MOVD 16(R8), R12 // R12 = x[i+1] SUBE R12, R8
MOVD 24(R8), R14 // R14 = x[i+2] SUBE R14, R9
MOVDU 32(R8), R15 // R15 = x[i+3] SUBE R15, R10
MOVD 8(R9), R16 // R16 = y[i] MOVD R7, 0(R6)
MOVD 16(R9), R17 // R17 = y[i+1] MOVD R8, 8(R6)
MOVD 24(R9), R18 // R18 = y[i+2] MOVD R9, 16(R6)
MOVDU 32(R9), R19 // R19 = y[i+3] MOVD R10, 24(R6)
SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA ADD $32, R4
SUBE R17, R12, R21 // R21 = x[i+1] - y[i+1] + CA ADD $32, R5
SUBE R18, R14, R22 // R22 = x[i+2] - y[i+2] + CA ADD $32, R6
SUBE R19, R15, R23 // R23 = x[i+3] - y[i+3] + CA BDNZ loop4cont
MOVD R20, 8(R10) // z[i] loop4done:
MOVD R21, 16(R10) // z[i+1] SUBE R4, R4 // save carry
MOVD R22, 24(R10) // z[i+2] SUB R4, R0, R4 // convert sub carry
MOVDU R23, 32(R10) // z[i+3]
ADD $-4, R7 // R7 = z_len - 4
BDNZ loop
// We may have more elements to read
CMP R7, $0
BEQ final
// Process the remaining elements, one at a time
tail:
MOVDU 8(R8), R11 // R11 = x[i]
MOVDU 8(R9), R16 // R16 = y[i]
ADD $-1, R7 // R7 = z_len - 1
SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA
CMP R7, $0
MOVDU R20, 8(R10) // z[i]
BEQ final // If R7 = 0, we are done
MOVDU 8(R8), R11
MOVDU 8(R9), R16
ADD $-1, R7
SUBE R16, R11, R20
CMP R7, $0
MOVDU R20, 8(R10)
BEQ final
MOVD 8(R8), R11
MOVD 8(R9), R16
SUBE R16, R11, R20
MOVD R20, 8(R10)
final:
ADDZE R4
XOR $1, R4
done:
MOVD R4, c+72(FP) MOVD R4, c+72(FP)
RET RET
// func lshVU(z, x []Word, s uint) (c Word) // func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0 TEXT ·lshVU(SB), NOSPLIT, $0
MOVD z+0(FP), R3 MOVD z_len+8(FP), R3
MOVD x+24(FP), R6 CMP R3, $0; BEQ ret0
MOVD s+48(FP), R9 MOVD s+48(FP), R4
MOVD z_len+8(FP), R4 MOVD x_base+24(FP), R5
MOVD x_len+32(FP), R7 MOVD z_base+0(FP), R6
CMP R4, $0 // len(z)==0 return // run loop backward
BEQ done SLD $3, R3, R7
ADD R7, R5
ADD $-1, R4, R5 // len(z)-1 SLD $3, R3, R7
SUBC R9, $64, R4 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64) ADD R7, R6
SLD $3, R5, R7 // shift first word into carry
ADD R6, R7, R15 // save starting address &x[len(z)-1] MOVD -8(R5), R7
ADD R3, R7, R16 // save starting address &z[len(z)-1] MOVD $64, R8
MOVD (R6)(R7), R14 SUB R4, R8
SRD R4, R14, R7 // compute x[len(z)-1]>>ŝ into R7 SRD R8, R7, R9
CMP R5, $0 // iterate from i=len(z)-1 to 0 SLD R4, R7
BEQ loopexit // Already at end? MOVD R9, c+56(FP)
MOVD 0(R15),R10 // x[i] // shift remaining words
PCALIGN $16 SUB $1, R3
shloop: // compute unrolled loop lengths
SLD R9, R10, R10 // x[i]<<s ANDCC $3, R3, R9
MOVDU -8(R15), R14 SRD $2, R3
SRD R4, R14, R11 // x[i-1]>>ŝ loop1:
OR R11, R10, R10 CMP R9, $0; BEQ loop1done; MOVD R9, CTR
MOVD R10, 0(R16) // z[i-1]=x[i]<<s | x[i-1]>>ŝ loop1cont:
MOVD R14, R10 // reuse x[i-1] for next iteration // unroll 1X
ADD $-8, R16 // i-- MOVD -16(R5), R10
CMP R15, R6 // &x[i-1]>&x[0]? SRD R8, R10, R11
BGT shloop OR R7, R11
loopexit: SLD R4, R10, R7
MOVD 0(R6), R4 MOVD R11, -8(R6)
SLD R9, R4, R4 ADD $-8, R5
MOVD R4, 0(R3) // z[0]=x[0]<<s ADD $-8, R6
MOVD R7, c+56(FP) // store pre-computed x[len(z)-1]>>ŝ into c BDNZ loop1cont
loop1done:
loop4:
CMP R3, $0; BEQ loop4done; MOVD R3, CTR
loop4cont:
// unroll 4X
MOVD -16(R5), R9
MOVD -24(R5), R10
MOVD -32(R5), R11
MOVD -40(R5), R12
SRD R8, R9, R14
OR R7, R14
SLD R4, R9, R7
SRD R8, R10, R9
OR R7, R9
SLD R4, R10, R7
SRD R8, R11, R10
OR R7, R10
SLD R4, R11, R7
SRD R8, R12, R11
OR R7, R11
SLD R4, R12, R7
MOVD R14, -8(R6)
MOVD R9, -16(R6)
MOVD R10, -24(R6)
MOVD R11, -32(R6)
ADD $-32, R5
ADD $-32, R6
BDNZ loop4cont
loop4done:
// store final shifted bits
MOVD R7, -8(R6)
RET RET
done: ret0:
MOVD R0, c+56(FP) // c=0 MOVD R0, c+56(FP)
RET RET
// func rshVU(z, x []Word, s uint) (c Word) // func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB), NOSPLIT, $0 TEXT ·rshVU(SB), NOSPLIT, $0
MOVD z+0(FP), R3 MOVD z_len+8(FP), R3
MOVD x+24(FP), R6 CMP R3, $0; BEQ ret0
MOVD s+48(FP), R9 MOVD s+48(FP), R4
MOVD z_len+8(FP), R4 MOVD x_base+24(FP), R5
MOVD x_len+32(FP), R7 MOVD z_base+0(FP), R6
// shift first word into carry
CMP R4, $0 // len(z)==0 return MOVD 0(R5), R7
BEQ done MOVD $64, R8
SUBC R9, $64, R5 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64) SUB R4, R8
SLD R8, R7, R9
MOVD 0(R6), R7 SRD R4, R7
SLD R5, R7, R7 // compute x[0]<<ŝ MOVD R9, c+56(FP)
MOVD $1, R8 // iterate from i=1 to i<len(z) // shift remaining words
CMP R8, R4 SUB $1, R3
BGE loopexit // Already at end? // compute unrolled loop lengths
ANDCC $3, R3, R9
// vectorize if len(z) is >=3, else jump to scalar loop SRD $2, R3
CMP R4, $3 loop1:
BLT scalar CMP R9, $0; BEQ loop1done; MOVD R9, CTR
MTVSRD R9, VS38 // s loop1cont:
VSPLTB $7, V6, V4 // unroll 1X
MTVSRD R5, VS39 // ŝ MOVD 8(R5), R10
VSPLTB $7, V7, V2 SLD R8, R10, R11
ADD $-2, R4, R16 OR R7, R11
PCALIGN $16 SRD R4, R10, R7
loopback: MOVD R11, 0(R6)
ADD $-1, R8, R10 ADD $8, R5
SLD $3, R10 ADD $8, R6
LXVD2X (R6)(R10), VS32 // load x[i-1], x[i] BDNZ loop1cont
SLD $3, R8, R12 loop1done:
LXVD2X (R6)(R12), VS33 // load x[i], x[i+1] loop4:
CMP R3, $0; BEQ loop4done; MOVD R3, CTR
VSRD V0, V4, V3 // x[i-1]>>s, x[i]>>s loop4cont:
VSLD V1, V2, V5 // x[i]<<ŝ, x[i+1]<<ŝ // unroll 4X
VOR V3, V5, V5 // Or(|) the two registers together MOVD 8(R5), R9
STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i] MOVD 16(R5), R10
ADD $2, R8 // Done processing 2 entries, i and i+1 MOVD 24(R5), R11
CMP R8, R16 // Are there at least a couple of more entries left? MOVD 32(R5), R12
BLE loopback SLD R8, R9, R14
CMP R8, R4 // Are we at the last element? OR R7, R14
BEQ loopexit SRD R4, R9, R7
scalar: SLD R8, R10, R9
ADD $-1, R8, R10 OR R7, R9
SLD $3, R10 SRD R4, R10, R7
MOVD (R6)(R10),R11 SLD R8, R11, R10
SRD R9, R11, R11 // x[len(z)-2] >> s OR R7, R10
SLD $3, R8, R12 SRD R4, R11, R7
MOVD (R6)(R12), R12 SLD R8, R12, R11
SLD R5, R12, R12 // x[len(z)-1]<<ŝ OR R7, R11
OR R12, R11, R11 // x[len(z)-2]>>s | x[len(z)-1]<<ŝ SRD R4, R12, R7
MOVD R11, (R3)(R10) // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ MOVD R14, 0(R6)
loopexit: MOVD R9, 8(R6)
ADD $-1, R4 MOVD R10, 16(R6)
SLD $3, R4 MOVD R11, 24(R6)
MOVD (R6)(R4), R5 ADD $32, R5
SRD R9, R5, R5 // x[len(z)-1]>>s ADD $32, R6
MOVD R5, (R3)(R4) // z[len(z)-1]=x[len(z)-1]>>s BDNZ loop4cont
MOVD R7, c+56(FP) // store pre-computed x[0]<<ŝ into c loop4done:
// store final shifted bits
MOVD R7, 0(R6)
RET RET
done: ret0:
MOVD R0, c+56(FP) MOVD R0, c+56(FP)
RET RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word) // func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0 TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R10 // R10 = z[] MOVD m+48(FP), R3
MOVD x+24(FP), R8 // R8 = x[] MOVD a+56(FP), R4
MOVD m+48(FP), R9 // R9 = m MOVD z_len+8(FP), R5
MOVD a+56(FP), R4 // R4 = a = c MOVD x_base+24(FP), R6
MOVD z_len+8(FP), R11 // R11 = z_len MOVD z_base+0(FP), R7
// compute unrolled loop lengths
CMP R11, $0 ANDCC $3, R5, R8
BEQ done SRD $2, R5
loop1:
MOVD 0(R8), R20 CMP R8, $0; BEQ loop1done; MOVD R8, CTR
ADD $-1, R11 loop1cont:
MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y) // unroll 1X
MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y) MOVD 0(R6), R9
ADDC R4, R6 // R6 = z0 + r // multiply
ADDZE R7, R4 // R4 = z1 + CA MULHDU R3, R9, R10
CMP R11, $0 MULLD R3, R9
MOVD R6, 0(R10) // z[i] ADDC R4, R9
BEQ done ADDE R0, R10, R4
MOVD R9, 0(R7)
// We will read 4 elements per iteration ADD $8, R6
SRDCC $2, R11, R14 // R14 = z_len/4 ADD $8, R7
DCBT (R8) BDNZ loop1cont
MOVD R14, CTR // Set up the loop counter loop1done:
BEQ tail // If R9 = 0, we can't use the loop loop4:
PCALIGN $16 CMP R5, $0; BEQ loop4done; MOVD R5, CTR
loop4cont:
loop: // unroll 4X
MOVD 8(R8), R20 // R20 = x[i] MOVD 0(R6), R8
MOVD 16(R8), R21 // R21 = x[i+1] MOVD 8(R6), R9
MOVD 24(R8), R22 // R22 = x[i+2] MOVD 16(R6), R10
MOVDU 32(R8), R23 // R23 = x[i+3] MOVD 24(R6), R11
MULLD R9, R20, R24 // R24 = z0[i] // multiply
MULHDU R9, R20, R20 // R20 = z1[i] MULHDU R3, R8, R12
ADDC R4, R24 // R24 = z0[i] + c MULLD R3, R8
MULLD R9, R21, R25 ADDC R4, R8
MULHDU R9, R21, R21 MULHDU R3, R9, R14
ADDE R20, R25 MULLD R3, R9
MULLD R9, R22, R26 ADDE R12, R9
MULHDU R9, R22, R22 MULHDU R3, R10, R12
MULLD R9, R23, R27 MULLD R3, R10
MULHDU R9, R23, R23 ADDE R14, R10
ADDE R21, R26 MULHDU R3, R11, R14
MOVD R24, 8(R10) // z[i] MULLD R3, R11
MOVD R25, 16(R10) // z[i+1] ADDE R12, R11
ADDE R22, R27 ADDE R0, R14, R4
ADDZE R23,R4 // update carry MOVD R8, 0(R7)
MOVD R26, 24(R10) // z[i+2] MOVD R9, 8(R7)
MOVDU R27, 32(R10) // z[i+3] MOVD R10, 16(R7)
ADD $-4, R11 // R11 = z_len - 4 MOVD R11, 24(R7)
BDNZ loop ADD $32, R6
ADD $32, R7
// We may have some elements to read BDNZ loop4cont
CMP R11, $0 loop4done:
BEQ done
// Process the remaining elements, one at a time
tail:
MOVDU 8(R8), R20 // R20 = x[i]
MULLD R9, R20, R24 // R24 = z0[i]
MULHDU R9, R20, R25 // R25 = z1[i]
ADD $-1, R11 // R11 = z_len - 1
ADDC R4, R24
ADDZE R25, R4
MOVDU R24, 8(R10) // z[i]
CMP R11, $0
BEQ done // If R11 = 0, we are done
MOVDU 8(R8), R20
MULLD R9, R20, R24
MULHDU R9, R20, R25
ADD $-1, R11
ADDC R4, R24
ADDZE R25, R4
MOVDU R24, 8(R10)
CMP R11, $0
BEQ done
MOVD 8(R8), R20
MULLD R9, R20, R24
MULHDU R9, R20, R25
ADD $-1, R11
ADDC R4, R24
ADDZE R25,R4
MOVD R24, 8(R10)
done:
MOVD R4, c+64(FP) MOVD R4, c+64(FP)
RET RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word) // func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0 TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R22 // R22 = z[] MOVD m+72(FP), R3
MOVD x+24(FP), R3 // R3 = x[] MOVD a+80(FP), R4
MOVD y+48(FP), R4 // R4 = y[] MOVD z_len+8(FP), R5
MOVD m+72(FP), R5 // R5 = m MOVD x_base+24(FP), R6
MOVD z_len+8(FP), R6 // R6 = z_len MOVD y_base+48(FP), R7
MOVD z_base+0(FP), R8
CMP R6, $4 // compute unrolled loop lengths
MOVD a+80(FP), R9 // R9 = c = a ANDCC $3, R5, R9
BLT tail SRD $2, R5
SRD $2, R6, R7 loop1:
MOVD R7, CTR // Initialize loop counter CMP R9, $0; BEQ loop1done; MOVD R9, CTR
PCALIGN $16 loop1cont:
// unroll 1X
loop: MOVD 0(R6), R10
MOVD 0(R4), R14 // y[i] MOVD 0(R7), R11
MOVD 8(R4), R16 // y[i+1] // multiply
MOVD 16(R4), R18 // y[i+2] MULHDU R3, R11, R12
MOVD 24(R4), R20 // y[i+3] MULLD R3, R11
MOVD 0(R3), R15 // x[i] ADDC R4, R11
MOVD 8(R3), R17 // x[i+1] ADDE R0, R12, R4
MOVD 16(R3), R19 // x[i+2] // add
MOVD 24(R3), R21 // x[i+3] ADDC R10, R11
MULLD R5, R14, R10 // low y[i]*m ADDE R0, R4
MULHDU R5, R14, R11 // high y[i]*m MOVD R11, 0(R8)
ADDC R15, R10 ADD $8, R6
ADDZE R11 ADD $8, R7
ADDC R9, R10 ADD $8, R8
ADDZE R11, R9 BDNZ loop1cont
MULLD R5, R16, R14 // low y[i+1]*m loop1done:
MULHDU R5, R16, R15 // high y[i+1]*m loop4:
ADDC R17, R14 CMP R5, $0; BEQ loop4done; MOVD R5, CTR
ADDZE R15 loop4cont:
// unroll 4X
MOVD 0(R6), R9
MOVD 8(R6), R10
MOVD 16(R6), R11
MOVD 24(R6), R12
MOVD 0(R7), R14
MOVD 8(R7), R15
MOVD 16(R7), R16
MOVD 24(R7), R17
// multiply
MULHDU R3, R14, R18
MULLD R3, R14
ADDC R4, R14
MULHDU R3, R15, R19
MULLD R3, R15
ADDE R18, R15
MULHDU R3, R16, R18
MULLD R3, R16
ADDE R19, R16
MULHDU R3, R17, R19
MULLD R3, R17
ADDE R18, R17
ADDE R0, R19, R4
// add
ADDC R9, R14 ADDC R9, R14
ADDZE R15, R9 ADDE R10, R15
MULLD R5, R18, R16 // low y[i+2]*m ADDE R11, R16
MULHDU R5, R18, R17 // high y[i+2]*m ADDE R12, R17
ADDC R19, R16 ADDE R0, R4
ADDZE R17 MOVD R14, 0(R8)
ADDC R9, R16 MOVD R15, 8(R8)
ADDZE R17, R9 MOVD R16, 16(R8)
MULLD R5, R20, R18 // low y[i+3]*m MOVD R17, 24(R8)
MULHDU R5, R20, R19 // high y[i+3]*m ADD $32, R6
ADDC R21, R18 ADD $32, R7
ADDZE R19 ADD $32, R8
ADDC R9, R18 BDNZ loop4cont
ADDZE R19, R9 loop4done:
MOVD R10, 0(R22) // z[i] MOVD R4, c+88(FP)
MOVD R14, 8(R22) // z[i+1]
MOVD R16, 16(R22) // z[i+2]
MOVD R18, 24(R22) // z[i+3]
ADD $32, R3
ADD $32, R4
ADD $32, R22
BDNZ loop
ANDCC $3, R6
tail:
CMP R6, $0
BEQ done
MOVD R6, CTR
PCALIGN $16
tailloop:
MOVD 0(R4), R14
MOVD 0(R3), R15
MULLD R5, R14, R10
MULHDU R5, R14, R11
ADDC R15, R10
ADDZE R11
ADDC R9, R10
ADDZE R11, R9
MOVD R10, 0(R22)
ADD $8, R3
ADD $8, R4
ADD $8, R22
BDNZ tailloop
done:
MOVD R9, c+88(FP)
RET RET
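
All of the generated lshVU bodies share one shape: process the words from the top down so that z and x may alias, return the bits shifted out of the highest word as c, and store the shifted lowest word last ("store final shifted bits"); rshVU mirrors this front to back. A portable sketch follows, assuming 0 < s < 64 as on the 64-bit ports; lshVUSketch is an illustrative name, not the pure-Go fallback in arith.go.

	package sketch

	// lshVUSketch: c receives the bits shifted out of x[n-1], each
	// z[i] combines x[i]<<s with the high bits of x[i-1], and z[0]
	// is written last so the backward loop tolerates z aliasing x.
	func lshVUSketch(z, x []uint64, s uint) (c uint64) {
		n := len(z)
		if n == 0 {
			return 0
		}
		sh := 64 - s // assumes 0 < s < 64
		c = x[n-1] >> sh
		for i := n - 1; i > 0; i-- {
			z[i] = x[i]<<s | x[i-1]>>sh
		}
		z[0] = x[0] << s // "store final shifted bits"
		return c
	}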


@@ -1,353 +1,457 @@
// Copyright 2020 The Go Authors. All rights reserved. // Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
//go:build !math_big_pure_go && riscv64 // Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go
#include "textflag.h" #include "textflag.h"
// This file provides fast assembly versions for the elementary // func addVV(z, x, y []Word) (c Word)
// arithmetic operations on vectors implemented in arith.go.
TEXT ·addVV(SB), NOSPLIT, $0 TEXT ·addVV(SB), NOSPLIT, $0
MOV x+24(FP), X5 MOV z_len+8(FP), X5
MOV y+48(FP), X6 MOV x_base+24(FP), X6
MOV z+0(FP), X7 MOV y_base+48(FP), X7
MOV z_len+8(FP), X30 MOV z_base+0(FP), X8
// compute unrolled loop lengths
MOV $4, X28 AND $3, X5, X9
MOV $0, X29 // c = 0 SRL $2, X5
XOR X28, X28 // clear carry
BEQZ X30, done
BLTU X30, X28, loop1
loop4:
MOV 0(X5), X8 // x[0]
MOV 0(X6), X9 // y[0]
MOV 8(X5), X11 // x[1]
MOV 8(X6), X12 // y[1]
MOV 16(X5), X14 // x[2]
MOV 16(X6), X15 // y[2]
MOV 24(X5), X17 // x[3]
MOV 24(X6), X18 // y[3]
ADD X8, X9, X21 // z[0] = x[0] + y[0]
SLTU X8, X21, X22
ADD X21, X29, X10 // z[0] = x[0] + y[0] + c
SLTU X21, X10, X23
ADD X22, X23, X29 // next c
ADD X11, X12, X24 // z[1] = x[1] + y[1]
SLTU X11, X24, X25
ADD X24, X29, X13 // z[1] = x[1] + y[1] + c
SLTU X24, X13, X26
ADD X25, X26, X29 // next c
ADD X14, X15, X21 // z[2] = x[2] + y[2]
SLTU X14, X21, X22
ADD X21, X29, X16 // z[2] = x[2] + y[2] + c
SLTU X21, X16, X23
ADD X22, X23, X29 // next c
ADD X17, X18, X21 // z[3] = x[3] + y[3]
SLTU X17, X21, X22
ADD X21, X29, X19 // z[3] = x[3] + y[3] + c
SLTU X21, X19, X23
ADD X22, X23, X29 // next c
MOV X10, 0(X7) // z[0]
MOV X13, 8(X7) // z[1]
MOV X16, 16(X7) // z[2]
MOV X19, 24(X7) // z[3]
ADD $32, X5
ADD $32, X6
ADD $32, X7
SUB $4, X30
BGEU X30, X28, loop4
BEQZ X30, done
loop1: loop1:
MOV 0(X5), X10 // x BEQZ X9, loop1done
MOV 0(X6), X11 // y loop1cont:
// unroll 1X
ADD X10, X11, X12 // z = x + y MOV 0(X6), X10
SLTU X10, X12, X14 MOV 0(X7), X11
ADD X12, X29, X13 // z = x + y + c ADD X11, X10 // ADCS X11, X10, X10 (cr=X28)
SLTU X12, X13, X15 SLTU X11, X10, X31 // ...
ADD X14, X15, X29 // next c ADD X28, X10 // ...
SLTU X28, X10, X28 // ...
MOV X13, 0(X7) // z ADD X31, X28 // ...
MOV X10, 0(X8)
ADD $8, X5
ADD $8, X6 ADD $8, X6
ADD $8, X7 ADD $8, X7
SUB $1, X30 ADD $8, X8
SUB $1, X9
BNEZ X30, loop1 BNEZ X9, loop1cont
loop1done:
done: loop4:
MOV X29, c+72(FP) // return c BEQZ X5, loop4done
loop4cont:
// unroll 4X
MOV 0(X6), X9
MOV 8(X6), X10
MOV 16(X6), X11
MOV 24(X6), X12
MOV 0(X7), X13
MOV 8(X7), X14
MOV 16(X7), X15
MOV 24(X7), X16
ADD X13, X9 // ADCS X13, X9, X9 (cr=X28)
SLTU X13, X9, X31 // ...
ADD X28, X9 // ...
SLTU X28, X9, X28 // ...
ADD X31, X28 // ...
ADD X14, X10 // ADCS X14, X10, X10 (cr=X28)
SLTU X14, X10, X31 // ...
ADD X28, X10 // ...
SLTU X28, X10, X28 // ...
ADD X31, X28 // ...
ADD X15, X11 // ADCS X15, X11, X11 (cr=X28)
SLTU X15, X11, X31 // ...
ADD X28, X11 // ...
SLTU X28, X11, X28 // ...
ADD X31, X28 // ...
ADD X16, X12 // ADCS X16, X12, X12 (cr=X28)
SLTU X16, X12, X31 // ...
ADD X28, X12 // ...
SLTU X28, X12, X28 // ...
ADD X31, X28 // ...
MOV X9, 0(X8)
MOV X10, 8(X8)
MOV X11, 16(X8)
MOV X12, 24(X8)
ADD $32, X6
ADD $32, X7
ADD $32, X8
SUB $1, X5
BNEZ X5, loop4cont
loop4done:
MOV X28, c+72(FP)
RET RET
// func subVV(z, x, y []Word) (c Word)
TEXT ·subVV(SB), NOSPLIT, $0 TEXT ·subVV(SB), NOSPLIT, $0
MOV x+24(FP), X5 MOV z_len+8(FP), X5
MOV y+48(FP), X6 MOV x_base+24(FP), X6
MOV z+0(FP), X7 MOV y_base+48(FP), X7
MOV z_len+8(FP), X30 MOV z_base+0(FP), X8
// compute unrolled loop lengths
MOV $4, X28 AND $3, X5, X9
MOV $0, X29 // b = 0 SRL $2, X5
XOR X28, X28 // clear carry
BEQZ X30, done
BLTU X30, X28, loop1
loop4:
MOV 0(X5), X8 // x[0]
MOV 0(X6), X9 // y[0]
MOV 8(X5), X11 // x[1]
MOV 8(X6), X12 // y[1]
MOV 16(X5), X14 // x[2]
MOV 16(X6), X15 // y[2]
MOV 24(X5), X17 // x[3]
MOV 24(X6), X18 // y[3]
SUB X9, X8, X21 // z[0] = x[0] - y[0]
SLTU X21, X8, X22
SUB X29, X21, X10 // z[0] = x[0] - y[0] - b
SLTU X10, X21, X23
ADD X22, X23, X29 // next b
SUB X12, X11, X24 // z[1] = x[1] - y[1]
SLTU X24, X11, X25
SUB X29, X24, X13 // z[1] = x[1] - y[1] - b
SLTU X13, X24, X26
ADD X25, X26, X29 // next b
SUB X15, X14, X21 // z[2] = x[2] - y[2]
SLTU X21, X14, X22
SUB X29, X21, X16 // z[2] = x[2] - y[2] - b
SLTU X16, X21, X23
ADD X22, X23, X29 // next b
SUB X18, X17, X21 // z[3] = x[3] - y[3]
SLTU X21, X17, X22
SUB X29, X21, X19 // z[3] = x[3] - y[3] - b
SLTU X19, X21, X23
ADD X22, X23, X29 // next b
MOV X10, 0(X7) // z[0]
MOV X13, 8(X7) // z[1]
MOV X16, 16(X7) // z[2]
MOV X19, 24(X7) // z[3]
ADD $32, X5
ADD $32, X6
ADD $32, X7
SUB $4, X30
BGEU X30, X28, loop4
BEQZ X30, done
loop1: loop1:
MOV 0(X5), X10 // x BEQZ X9, loop1done
MOV 0(X6), X11 // y loop1cont:
// unroll 1X
SUB X11, X10, X12 // z = x - y MOV 0(X6), X10
SLTU X12, X10, X14 MOV 0(X7), X11
SUB X29, X12, X13 // z = x - y - b SLTU X28, X10, X31 // SBCS X11, X10, X10
SLTU X13, X12, X15 SUB X28, X10 // ...
ADD X14, X15, X29 // next b SLTU X11, X10, X28 // ...
SUB X11, X10 // ...
MOV X13, 0(X7) // z ADD X31, X28 // ...
MOV X10, 0(X8)
ADD $8, X5
ADD $8, X6 ADD $8, X6
ADD $8, X7 ADD $8, X7
SUB $1, X30 ADD $8, X8
SUB $1, X9
BNEZ X30, loop1 BNEZ X9, loop1cont
loop1done:
done: loop4:
MOV X29, c+72(FP) // return b BEQZ X5, loop4done
loop4cont:
// unroll 4X
MOV 0(X6), X9
MOV 8(X6), X10
MOV 16(X6), X11
MOV 24(X6), X12
MOV 0(X7), X13
MOV 8(X7), X14
MOV 16(X7), X15
MOV 24(X7), X16
SLTU X28, X9, X31 // SBCS X13, X9, X9
SUB X28, X9 // ...
SLTU X13, X9, X28 // ...
SUB X13, X9 // ...
ADD X31, X28 // ...
SLTU X28, X10, X31 // SBCS X14, X10, X10
SUB X28, X10 // ...
SLTU X14, X10, X28 // ...
SUB X14, X10 // ...
ADD X31, X28 // ...
SLTU X28, X11, X31 // SBCS X15, X11, X11
SUB X28, X11 // ...
SLTU X15, X11, X28 // ...
SUB X15, X11 // ...
ADD X31, X28 // ...
SLTU X28, X12, X31 // SBCS X16, X12, X12
SUB X28, X12 // ...
SLTU X16, X12, X28 // ...
SUB X16, X12 // ...
ADD X31, X28 // ...
MOV X9, 0(X8)
MOV X10, 8(X8)
MOV X11, 16(X8)
MOV X12, 24(X8)
ADD $32, X6
ADD $32, X7
ADD $32, X8
SUB $1, X5
BNEZ X5, loop4cont
loop4done:
MOV X28, c+72(FP)
RET RET
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0 TEXT ·lshVU(SB), NOSPLIT, $0
JMP ·lshVU_g(SB) MOV z_len+8(FP), X5
BEQZ X5, ret0
MOV s+48(FP), X6
MOV x_base+24(FP), X7
MOV z_base+0(FP), X8
// run loop backward
SLL $3, X5, X9
ADD X9, X7
SLL $3, X5, X9
ADD X9, X8
// shift first word into carry
MOV -8(X7), X9
MOV $64, X10
SUB X6, X10
SRL X10, X9, X11
SLL X6, X9
MOV X11, c+56(FP)
// shift remaining words
SUB $1, X5
// compute unrolled loop lengths
AND $3, X5, X11
SRL $2, X5
loop1:
BEQZ X11, loop1done
loop1cont:
// unroll 1X
MOV -16(X7), X12
SRL X10, X12, X13
OR X9, X13
SLL X6, X12, X9
MOV X13, -8(X8)
ADD $-8, X7
ADD $-8, X8
SUB $1, X11
BNEZ X11, loop1cont
loop1done:
loop4:
BEQZ X5, loop4done
loop4cont:
// unroll 4X
MOV -16(X7), X11
MOV -24(X7), X12
MOV -32(X7), X13
MOV -40(X7), X14
SRL X10, X11, X15
OR X9, X15
SLL X6, X11, X9
SRL X10, X12, X11
OR X9, X11
SLL X6, X12, X9
SRL X10, X13, X12
OR X9, X12
SLL X6, X13, X9
SRL X10, X14, X13
OR X9, X13
SLL X6, X14, X9
MOV X15, -8(X8)
MOV X11, -16(X8)
MOV X12, -24(X8)
MOV X13, -32(X8)
ADD $-32, X7
ADD $-32, X8
SUB $1, X5
BNEZ X5, loop4cont
loop4done:
// store final shifted bits
MOV X9, -8(X8)
RET
ret0:
MOV X0, c+56(FP)
RET
// func rshVU(z, x []Word, s uint) (c Word)
TEXT ·rshVU(SB), NOSPLIT, $0 TEXT ·rshVU(SB), NOSPLIT, $0
JMP ·rshVU_g(SB) MOV z_len+8(FP), X5
BEQZ X5, ret0
MOV s+48(FP), X6
MOV x_base+24(FP), X7
MOV z_base+0(FP), X8
// shift first word into carry
MOV 0(X7), X9
MOV $64, X10
SUB X6, X10
SLL X10, X9, X11
SRL X6, X9
MOV X11, c+56(FP)
// shift remaining words
SUB $1, X5
// compute unrolled loop lengths
AND $3, X5, X11
SRL $2, X5
loop1:
BEQZ X11, loop1done
loop1cont:
// unroll 1X
MOV 8(X7), X12
SLL X10, X12, X13
OR X9, X13
SRL X6, X12, X9
MOV X13, 0(X8)
ADD $8, X7
ADD $8, X8
SUB $1, X11
BNEZ X11, loop1cont
loop1done:
loop4:
BEQZ X5, loop4done
loop4cont:
// unroll 4X
MOV 8(X7), X11
MOV 16(X7), X12
MOV 24(X7), X13
MOV 32(X7), X14
SLL X10, X11, X15
OR X9, X15
SRL X6, X11, X9
SLL X10, X12, X11
OR X9, X11
SRL X6, X12, X9
SLL X10, X13, X12
OR X9, X12
SRL X6, X13, X9
SLL X10, X14, X13
OR X9, X13
SRL X6, X14, X9
MOV X15, 0(X8)
MOV X11, 8(X8)
MOV X12, 16(X8)
MOV X13, 24(X8)
ADD $32, X7
ADD $32, X8
SUB $1, X5
BNEZ X5, loop4cont
loop4done:
// store final shifted bits
MOV X9, 0(X8)
RET
ret0:
MOV X0, c+56(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0 TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOV x+24(FP), X5 MOV m+48(FP), X5
MOV m+48(FP), X6 MOV a+56(FP), X6
MOV z+0(FP), X7 MOV z_len+8(FP), X7
MOV z_len+8(FP), X30 MOV x_base+24(FP), X8
MOV a+56(FP), X29 MOV z_base+0(FP), X9
// compute unrolled loop lengths
MOV $4, X28 AND $3, X7, X10
SRL $2, X7
BEQ ZERO, X30, done
BLTU X30, X28, loop1
loop4:
MOV 0(X5), X8 // x[0]
MOV 8(X5), X11 // x[1]
MOV 16(X5), X14 // x[2]
MOV 24(X5), X17 // x[3]
MULHU X8, X6, X9 // z_hi[0] = x[0] * m
MUL X8, X6, X8 // z_lo[0] = x[0] * m
ADD X8, X29, X10 // z[0] = z_lo[0] + c
SLTU X8, X10, X23
ADD X23, X9, X29 // next c
MULHU X11, X6, X12 // z_hi[1] = x[1] * m
MUL X11, X6, X11 // z_lo[1] = x[1] * m
ADD X11, X29, X13 // z[1] = z_lo[1] + c
SLTU X11, X13, X23
ADD X23, X12, X29 // next c
MULHU X14, X6, X15 // z_hi[2] = x[2] * m
MUL X14, X6, X14 // z_lo[2] = x[2] * m
ADD X14, X29, X16 // z[2] = z_lo[2] + c
SLTU X14, X16, X23
ADD X23, X15, X29 // next c
MULHU X17, X6, X18 // z_hi[3] = x[3] * m
MUL X17, X6, X17 // z_lo[3] = x[3] * m
ADD X17, X29, X19 // z[3] = z_lo[3] + c
SLTU X17, X19, X23
ADD X23, X18, X29 // next c
MOV X10, 0(X7) // z[0]
MOV X13, 8(X7) // z[1]
MOV X16, 16(X7) // z[2]
MOV X19, 24(X7) // z[3]
ADD $32, X5
ADD $32, X7
SUB $4, X30
BGEU X30, X28, loop4
BEQZ X30, done
loop1: loop1:
MOV 0(X5), X10 // x BEQZ X10, loop1done
loop1cont:
MULHU X10, X6, X12 // z_hi = x * m // unroll 1X
MUL X10, X6, X10 // z_lo = x * m MOV 0(X8), X11
ADD X10, X29, X13 // z_lo + c // synthetic carry, one column at a time
SLTU X10, X13, X15 MUL X5, X11, X12
ADD X12, X15, X29 // next c MULHU X5, X11, X13
ADD X6, X12, X11 // ADDS X6, X12, X11 (cr=X28)
MOV X13, 0(X7) // z SLTU X6, X11, X28 // ...
ADD X28, X13, X6 // ADC $0, X13, X6
ADD $8, X5 MOV X11, 0(X9)
ADD $8, X7 ADD $8, X8
SUB $1, X30 ADD $8, X9
SUB $1, X10
BNEZ X30, loop1 BNEZ X10, loop1cont
loop1done:
done: loop4:
MOV X29, c+64(FP) // return c BEQZ X7, loop4done
loop4cont:
// unroll 4X
MOV 0(X8), X10
MOV 8(X8), X11
MOV 16(X8), X12
MOV 24(X8), X13
// synthetic carry, one column at a time
MUL X5, X10, X14
MULHU X5, X10, X15
ADD X6, X14, X10 // ADDS X6, X14, X10 (cr=X28)
SLTU X6, X10, X28 // ...
ADD X28, X15, X6 // ADC $0, X15, X6
MUL X5, X11, X14
MULHU X5, X11, X15
ADD X6, X14, X11 // ADDS X6, X14, X11 (cr=X28)
SLTU X6, X11, X28 // ...
ADD X28, X15, X6 // ADC $0, X15, X6
MUL X5, X12, X14
MULHU X5, X12, X15
ADD X6, X14, X12 // ADDS X6, X14, X12 (cr=X28)
SLTU X6, X12, X28 // ...
ADD X28, X15, X6 // ADC $0, X15, X6
MUL X5, X13, X14
MULHU X5, X13, X15
ADD X6, X14, X13 // ADDS X6, X14, X13 (cr=X28)
SLTU X6, X13, X28 // ...
ADD X28, X15, X6 // ADC $0, X15, X6
MOV X10, 0(X9)
MOV X11, 8(X9)
MOV X12, 16(X9)
MOV X13, 24(X9)
ADD $32, X8
ADD $32, X9
SUB $1, X7
BNEZ X7, loop4cont
loop4done:
MOV X6, c+64(FP)
RET RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0 TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOV y+48(FP), X5 MOV m+72(FP), X5
MOV m+72(FP), X6 MOV a+80(FP), X6
MOV x+24(FP), X7 MOV z_len+8(FP), X7
MOV z+0(FP), X20 MOV x_base+24(FP), X8
MOV z_len+8(FP), X30 MOV y_base+48(FP), X9
MOV z_base+0(FP), X10
MOV $4, X28 // compute unrolled loop lengths
MOV a+80(FP), X29 // c = a AND $3, X7, X11
SRL $2, X7
BEQZ X30, done
BLTU X30, X28, loop1
loop4:
MOV 0(X5), X8 // y[0]
MOV 0(X7), X10 // x[0]
MOV 8(X5), X11 // y[1]
MOV 8(X7), X13 // x[1]
MOV 16(X5), X14 // y[2]
MOV 16(X7), X16 // x[2]
MOV 24(X5), X17 // y[3]
MOV 24(X7), X19 // x[3]
MULHU X8, X6, X9 // x_hi[0] = y[0] * m
MUL X8, X6, X8 // x_lo[0] = y[0] * m
ADD X8, X10, X21 // x_lo[0] = y[0] * m + x[0]
SLTU X8, X21, X22
ADD X9, X22, X9 // x_hi[0] = y[0] * m + x[0]
ADD X21, X29, X10 // x[0] = y[0] * m + x[0] + c
SLTU X21, X10, X22
ADD X9, X22, X29 // next c
MULHU X11, X6, X12 // x_hi[1] = y[1] * m
MUL X11, X6, X11 // x_lo[1] = y[1] * m
ADD X11, X13, X21 // x_lo[1] = y[1] * m + x[1]
SLTU X11, X21, X22
ADD X12, X22, X12 // x_hi[1] = y[1] * m + x[1]
ADD X21, X29, X13 // x[1] = y[1] * m + x[1] + c
SLTU X21, X13, X22
ADD X12, X22, X29 // next c
MULHU X14, X6, X15 // x_hi[2] = y[2] * m
MUL X14, X6, X14 // x_lo[2] = y[2] * m
ADD X14, X16, X21 // x_lo[2] = y[2] * m + x[2]
SLTU X14, X21, X22
ADD X15, X22, X15 // x_hi[2] = y[2] * m + x[2]
ADD X21, X29, X16 // x[2] = y[2] * m + x[2] + c
SLTU X21, X16, X22
ADD X15, X22, X29 // next c
MULHU X17, X6, X18 // x_hi[3] = y[3] * m
MUL X17, X6, X17 // x_lo[3] = y[3] * m
ADD X17, X19, X21 // x_lo[3] = y[3] * m + x[3]
SLTU X17, X21, X22
ADD X18, X22, X18 // x_hi[3] = y[3] * m + x[3]
ADD X21, X29, X19 // x[3] = y[3] * m + x[3] + c
SLTU X21, X19, X22
ADD X18, X22, X29 // next c
MOV X10, 0(X20) // z[0]
MOV X13, 8(X20) // z[1]
MOV X16, 16(X20) // z[2]
MOV X19, 24(X20) // z[3]
ADD $32, X5
ADD $32, X7
ADD $32, X20
SUB $4, X30
BGEU X30, X28, loop4
BEQZ X30, done
loop1:
MOV 0(X5), X10 // y
MOV 0(X7), X11 // x
MULHU X10, X6, X12 // z_hi = y * m
MUL X10, X6, X10 // z_lo = y * m
ADD X10, X11, X13 // z_lo = y * m + x
SLTU X10, X13, X15
ADD X12, X15, X12 // z_hi = y * m + x
ADD X13, X29, X10 // z = y * m + x + c
SLTU X13, X10, X15
ADD X12, X15, X29 // next c
MOV X10, 0(X20) // z
ADD $8, X5
ADD $8, X7
ADD $8, X20
SUB $1, X30
BNEZ X30, loop1
done:
MOV X29, c+88(FP) // return c
RET

TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOV m+72(FP), X5
MOV a+80(FP), X6
MOV z_len+8(FP), X7
MOV x_base+24(FP), X8
MOV y_base+48(FP), X9
MOV z_base+0(FP), X10
// compute unrolled loop lengths
AND $3, X7, X11
SRL $2, X7
loop1:
BEQZ X11, loop1done
loop1cont:
// unroll 1X
MOV 0(X8), X12
MOV 0(X9), X13
// synthetic carry, one column at a time
MUL X5, X13, X14
MULHU X5, X13, X15
ADD X12, X14 // ADDS X12, X14, X14 (cr=X28)
SLTU X12, X14, X28 // ...
ADD X28, X15 // ADC $0, X15, X15
ADD X6, X14, X13 // ADDS X6, X14, X13 (cr=X28)
SLTU X6, X13, X28 // ...
ADD X28, X15, X6 // ADC $0, X15, X6
MOV X13, 0(X10)
ADD $8, X8
ADD $8, X9
ADD $8, X10
SUB $1, X11
BNEZ X11, loop1cont
loop1done:
loop4:
BEQZ X7, loop4done
loop4cont:
// unroll 4X
MOV 0(X8), X11
MOV 8(X8), X12
MOV 16(X8), X13
MOV 24(X8), X14
MOV 0(X9), X15
MOV 8(X9), X16
MOV 16(X9), X17
MOV 24(X9), X18
// synthetic carry, one column at a time
MUL X5, X15, X19
MULHU X5, X15, X20
ADD X11, X19 // ADDS X11, X19, X19 (cr=X28)
SLTU X11, X19, X28 // ...
ADD X28, X20 // ADC $0, X20, X20
ADD X6, X19, X15 // ADDS X6, X19, X15 (cr=X28)
SLTU X6, X15, X28 // ...
ADD X28, X20, X6 // ADC $0, X20, X6
MUL X5, X16, X19
MULHU X5, X16, X20
ADD X12, X19 // ADDS X12, X19, X19 (cr=X28)
SLTU X12, X19, X28 // ...
ADD X28, X20 // ADC $0, X20, X20
ADD X6, X19, X16 // ADDS X6, X19, X16 (cr=X28)
SLTU X6, X16, X28 // ...
ADD X28, X20, X6 // ADC $0, X20, X6
MUL X5, X17, X19
MULHU X5, X17, X20
ADD X13, X19 // ADDS X13, X19, X19 (cr=X28)
SLTU X13, X19, X28 // ...
ADD X28, X20 // ADC $0, X20, X20
ADD X6, X19, X17 // ADDS X6, X19, X17 (cr=X28)
SLTU X6, X17, X28 // ...
ADD X28, X20, X6 // ADC $0, X20, X6
MUL X5, X18, X19
MULHU X5, X18, X20
ADD X14, X19 // ADDS X14, X19, X19 (cr=X28)
SLTU X14, X19, X28 // ...
ADD X28, X20 // ADC $0, X20, X20
ADD X6, X19, X18 // ADDS X6, X19, X18 (cr=X28)
SLTU X6, X18, X28 // ...
ADD X28, X20, X6 // ADC $0, X20, X6
MOV X15, 0(X10)
MOV X16, 8(X10)
MOV X17, 16(X10)
MOV X18, 24(X10)
ADD $32, X8
ADD $32, X9
ADD $32, X10
SUB $1, X7
BNEZ X7, loop4cont
loop4done:
MOV X6, c+88(FP)
RET
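
The same reading aid for addMulVVWW, whose signature is given in the comment above: z = x + y*m, with a seeding the carry chain. Again a standalone sketch (Word redeclared locally), not the package's portable implementation verbatim:

	package main

	import (
		"fmt"
		"math/bits"
	)

	// Word stands in for math/big's Word (one unsigned machine word).
	type Word uint

	// addMulVVWW mirrors the loop bodies above: z = x + y*m, with a seeding
	// the carry chain; the final carry word is returned.
	func addMulVVWW(z, x, y []Word, m, a Word) (c Word) {
		c = a
		for i := range z {
			hi, lo := bits.Mul(uint(y[i]), uint(m)) // 2-word product y[i]*m
			lo, cc := bits.Add(lo, uint(x[i]), 0)   // add x[i] ...
			hi += cc
			lo, cc = bits.Add(lo, uint(c), 0) // ... then the incoming carry
			z[i] = Word(lo)
			c = Word(hi + cc) // carry out; cannot overflow a Word
		}
		return
	}

	func main() {
		z := make([]Word, 2)
		c := addMulVVWW(z, []Word{1, 2}, []Word{3, 4}, 10, 5)
		fmt.Println(z, c) // [36 42] 0
	}
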

File diff suppressed because it is too large


@@ -2,14 +2,18 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build s390x && !math_big_pure_go
package big
import "testing"
func TestNoVec(t *testing.T) {
	// Make sure non-vector versions match vector versions.
	t.Run("AddVV", func(t *testing.T) { testVV(t, "addVV_novec", addVV_novec, addVV) })
	t.Run("SubVV", func(t *testing.T) { testVV(t, "subVV_novec", subVV_novec, subVV) })
}

//go:build !math_big_pure_go
package big
import "testing"
func TestAddVVNoVec(t *testing.T) {
	setDuringTest(t, &hasVX, false)
	TestAddVV(t)
}
func TestSubVVNoVec(t *testing.T) {
	setDuringTest(t, &hasVX, false)
	TestSubVV(t)
}
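
setDuringTest itself is not part of this hunk. Assuming it is the usual save/replace/restore test helper, a minimal sketch could look like the following; the name and signature here are taken from the call sites above and may not match the real helper exactly.

	package big

	import "testing"

	// setDuringTest is a sketch of a save/replace/restore helper: it sets *p
	// to val for the duration of the test and restores the old value afterward.
	func setDuringTest[T any](t *testing.T, p *T, val T) {
		t.Helper()
		old := *p
		*p = val
		t.Cleanup(func() { *p = old }) // restore once the test (and its subtests) finish
	}
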


@@ -8,11 +8,7 @@ package big
import "internal/cpu"
func addVV_check(z, x, y []Word) (c Word)
func addVV_vec(z, x, y []Word) (c Word)
func addVV_novec(z, x, y []Word) (c Word)
func subVV_check(z, x, y []Word) (c Word)
func subVV_vec(z, x, y []Word) (c Word)
func subVV_novec(z, x, y []Word) (c Word)
var hasVX = cpu.S390X.HasVX
func addVVvec(z, x, y []Word) (c Word)
func subVVvec(z, x, y []Word) (c Word)


@@ -0,0 +1,310 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !math_big_pure_go
#include "textflag.h"
TEXT ·addVVvec(SB), NOSPLIT, $0
MOVD z_len+8(FP), R3
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD z+0(FP), R2
MOVD $0, R4 // c = 0
MOVD $0, R0 // make sure it's zero
MOVD $0, R10 // i = 0
// s/JL/JMP/ below to disable the unrolled loop
SUB $4, R3
BLT v1
SUB $12, R3 // n -= 16
BLT A1 // if n < 0 goto A1
MOVD R8, R5
MOVD R9, R6
MOVD R2, R7
// n >= 0
// regular loop body unrolled 16x
VZERO V0 // c = 0
UU1:
VLM 0(R5), V1, V4 // 64-bytes into V1..V8
ADD $64, R5
VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
VLM 0(R6), V9, V12 // 64-bytes into V9..V16
ADD $64, R6
VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order
VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
VACCCQ V1, V9, V0, V25
VACQ V1, V9, V0, V17
VACCCQ V2, V10, V25, V26
VACQ V2, V10, V25, V18
VLM 0(R5), V5, V6 // 32-bytes into V1..V8
VLM 0(R6), V13, V14 // 32-bytes into V9..V16
ADD $32, R5
ADD $32, R6
VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
VACCCQ V3, V11, V26, V27
VACQ V3, V11, V26, V19
VACCCQ V4, V12, V27, V28
VACQ V4, V12, V27, V20
VLM 0(R5), V7, V8 // 32-bytes into V1..V8
VLM 0(R6), V15, V16 // 32-bytes into V9..V16
ADD $32, R5
ADD $32, R6
VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
VACCCQ V5, V13, V28, V29
VACQ V5, V13, V28, V21
VACCCQ V6, V14, V29, V30
VACQ V6, V14, V29, V22
VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
VACCCQ V7, V15, V30, V31
VACQ V7, V15, V30, V23
VACCCQ V8, V16, V31, V0 // V0 has carry-over
VACQ V8, V16, V31, V24
VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
VSTM V17, V24, 0(R7) // 128-bytes into z
ADD $128, R7
ADD $128, R10 // i += 16
SUB $16, R3 // n -= 16
BGE UU1 // if n >= 0 goto U1
VLGVG $1, V0, R4 // put cf into R4
NEG R4, R4 // save cf
A1:
ADD $12, R3 // n += 16
// s/JL/JMP/ below to disable the unrolled loop
BLT v1 // if n < 0 goto v1
U1: // n >= 0
// regular loop body unrolled 4x
MOVD 0(R8)(R10*1), R5
MOVD 8(R8)(R10*1), R6
MOVD 16(R8)(R10*1), R7
MOVD 24(R8)(R10*1), R1
ADDC R4, R4 // restore CF
MOVD 0(R9)(R10*1), R11
ADDE R11, R5
MOVD 8(R9)(R10*1), R11
ADDE R11, R6
MOVD 16(R9)(R10*1), R11
ADDE R11, R7
MOVD 24(R9)(R10*1), R11
ADDE R11, R1
MOVD R0, R4
ADDE R4, R4 // save CF
NEG R4, R4
MOVD R5, 0(R2)(R10*1)
MOVD R6, 8(R2)(R10*1)
MOVD R7, 16(R2)(R10*1)
MOVD R1, 24(R2)(R10*1)
ADD $32, R10 // i += 4
SUB $4, R3 // n -= 4
BGE U1 // if n >= 0 goto U1
v1:
ADD $4, R3 // n += 4
BLE E1 // if n <= 0 goto E1
L1: // n > 0
ADDC R4, R4 // restore CF
MOVD 0(R8)(R10*1), R5
MOVD 0(R9)(R10*1), R11
ADDE R11, R5
MOVD R5, 0(R2)(R10*1)
MOVD R0, R4
ADDE R4, R4 // save CF
NEG R4, R4
ADD $8, R10 // i++
SUB $1, R3 // n--
BGT L1 // if n > 0 goto L1
E1:
NEG R4, R4
MOVD R4, c+72(FP) // return c
RET
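
The vector loop above is a widened form of the scalar add-with-carry recurrence, chaining 128-bit adds through VACQ/VACCCQ. For reference, the semantics addVVvec must match, as a standalone sketch (Word redeclared locally):

	package main

	import (
		"fmt"
		"math/bits"
	)

	// Word stands in for math/big's Word (one unsigned machine word).
	type Word uint

	// addVV is the portable recurrence the vector code implements:
	// z = x + y, returning the final carry (0 or 1).
	func addVV(z, x, y []Word) (c Word) {
		for i := range z {
			sum, cc := bits.Add(uint(x[i]), uint(y[i]), uint(c))
			z[i] = Word(sum)
			c = Word(cc)
		}
		return
	}

	func main() {
		z := make([]Word, 2)
		c := addVV(z, []Word{^Word(0), 1}, []Word{1, 2})
		fmt.Println(z, c) // [0 4] 0: the low word wraps and the carry ripples up
	}
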
TEXT ·subVVvec(SB), NOSPLIT, $0
MOVD z_len+8(FP), R3
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD z+0(FP), R2
MOVD $0, R4 // c = 0
MOVD $0, R0 // make sure it's zero
MOVD $0, R10 // i = 0
// s/JL/JMP/ below to disable the unrolled loop
SUB $4, R3 // n -= 4
BLT v1 // if n < 0 goto v1
SUB $12, R3 // n -= 16
BLT A1 // if n < 0 goto A1
MOVD R8, R5
MOVD R9, R6
MOVD R2, R7
// n >= 0
// regular loop body unrolled 16x
VZERO V0 // cf = 0
MOVD $1, R4 // for 390 subtraction cf starts as 1 (no borrow)
VLVGG $1, R4, V0 // put carry into V0
UU1:
VLM 0(R5), V1, V4 // 64-bytes into V1..V8
ADD $64, R5
VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
VLM 0(R6), V9, V12 // 64-bytes into V9..V16
ADD $64, R6
VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order
VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
VSBCBIQ V1, V9, V0, V25
VSBIQ V1, V9, V0, V17
VSBCBIQ V2, V10, V25, V26
VSBIQ V2, V10, V25, V18
VLM 0(R5), V5, V6 // 32-bytes into V1..V8
VLM 0(R6), V13, V14 // 32-bytes into V9..V16
ADD $32, R5
ADD $32, R6
VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
VSBCBIQ V3, V11, V26, V27
VSBIQ V3, V11, V26, V19
VSBCBIQ V4, V12, V27, V28
VSBIQ V4, V12, V27, V20
VLM 0(R5), V7, V8 // 32-bytes into V1..V8
VLM 0(R6), V15, V16 // 32-bytes into V9..V16
ADD $32, R5
ADD $32, R6
VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
VSBCBIQ V5, V13, V28, V29
VSBIQ V5, V13, V28, V21
VSBCBIQ V6, V14, V29, V30
VSBIQ V6, V14, V29, V22
VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
VSBCBIQ V7, V15, V30, V31
VSBIQ V7, V15, V30, V23
VSBCBIQ V8, V16, V31, V0 // V0 has carry-over
VSBIQ V8, V16, V31, V24
VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
VSTM V17, V24, 0(R7) // 128-bytes into z
ADD $128, R7
ADD $128, R10 // i += 16
SUB $16, R3 // n -= 16
BGE UU1 // if n >= 0 goto U1
VLGVG $1, V0, R4 // put cf into R4
SUB $1, R4 // save cf
A1:
ADD $12, R3 // n += 16
BLT v1 // if n < 0 goto v1
U1: // n >= 0
// regular loop body unrolled 4x
MOVD 0(R8)(R10*1), R5
MOVD 8(R8)(R10*1), R6
MOVD 16(R8)(R10*1), R7
MOVD 24(R8)(R10*1), R1
MOVD R0, R11
SUBC R4, R11 // restore CF
MOVD 0(R9)(R10*1), R11
SUBE R11, R5
MOVD 8(R9)(R10*1), R11
SUBE R11, R6
MOVD 16(R9)(R10*1), R11
SUBE R11, R7
MOVD 24(R9)(R10*1), R11
SUBE R11, R1
MOVD R0, R4
SUBE R4, R4 // save CF
MOVD R5, 0(R2)(R10*1)
MOVD R6, 8(R2)(R10*1)
MOVD R7, 16(R2)(R10*1)
MOVD R1, 24(R2)(R10*1)
ADD $32, R10 // i += 4
SUB $4, R3 // n -= 4
BGE U1 // if n >= 0 goto U1n
v1:
ADD $4, R3 // n += 4
BLE E1 // if n <= 0 goto E1
L1: // n > 0
MOVD R0, R11
SUBC R4, R11 // restore CF
MOVD 0(R8)(R10*1), R5
MOVD 0(R9)(R10*1), R11
SUBE R11, R5
MOVD R5, 0(R2)(R10*1)
MOVD R0, R4
SUBE R4, R4 // save CF
ADD $8, R10 // i++
SUB $1, R3 // n--
BGT L1 // if n > 0 goto L1n
E1:
NEG R4, R4
MOVD R4, c+72(FP) // return c
RET
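
subVVvec keeps the s390x convention that carry = 1 means "no borrow", hence the $1 seed loaded into V0 before the vector loop and the SUB $1 / NEG fixups around it. The portable semantics it must match, as a standalone sketch (Word redeclared locally):

	package main

	import (
		"fmt"
		"math/bits"
	)

	// Word stands in for math/big's Word (one unsigned machine word).
	type Word uint

	// subVV is the portable recurrence subVVvec must match: z = x - y,
	// returning the final borrow (0 or 1). The assembly tracks the inverted
	// form (1 = no borrow) internally and converts at the boundaries.
	func subVV(z, x, y []Word) (c Word) {
		for i := range z {
			diff, bb := bits.Sub(uint(x[i]), uint(y[i]), uint(c))
			z[i] = Word(diff)
			c = Word(bb)
		}
		return
	}

	func main() {
		z := make([]Word, 2)
		c := subVV(z, []Word{0, 5}, []Word{1, 2})
		fmt.Println(z, c) // on 64-bit: [18446744073709551615 2] 0 (borrow ripples up)
	}
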


@@ -15,7 +15,6 @@ import (
var generateFlag = flag.Bool("generate", false, "generate files")
func Test(t *testing.T) {
t.Skip("assembly not yet installed")
for _, arch := range arches {
t.Run(arch.Name, func(t *testing.T) {
file, data := generate(arch)