math/big: replace addVW/subVW assembly with fast pure Go

The vast majority of the time, carry propagation is limited and
addVW/subVW only need to consider a single word for carry propagation.
As Josh Bleecher-Snyder pointed out in 2019 (CL 164968), once carrying
is done, the remaining words can be handled faster with copy (memmove).
In the benchmarks below, this is the data=random case.

Even more important, if the source and destination are the same,
the copy can be optimized away entirely, making a small in-place
addition to a big.Int O(1) instead of O(N). To date, only a few
systems (amd64, arm64, and pure Go, meaning wasm) make use of this
asymptotic improvement. This is the data=shortcut case.

This CL deletes the addVW/subVW assembly and replaces it with
an optimized pure Go version. Using Go makes it easy to call
the real copy builtin, which will use optimized memmove code,
instead of recreating a worse memmove in assembly (as arm64 does)
or omitting the copy optimization entirely (as most others do).

The worst case for the Go version versus assembly is the case
of incrementing 2^N-1 by 1, which has to propagate a carry
the entire length of the array. This is the data=carry case.
On balance, we believe this case is rare enough that it is worth
taking a hit there, in exchange for significant wins
in the other cases and the deletion of significant amounts of
assembly of varying quality. (Remember that half the assembly has
the copy optimization and shortcut, while half does not.)

In the benchmarks, the systems are:

	c2s16     GOARCH=amd64     c2s16 perf gomote (Intel, Google Cloud)
	c3h88     GOARCH=amd64     c3h88 perf gomote (newer Intel, Google Cloud)
	s7        GOARCH=amd64     rsc basement server (AMD Ryzen 9 7950X)
	c4as16    GOARCH=arm64     c4as16 perf gomote (Google Cloud)
	mac       GOARCH=arm64     Apple M3 Pro in MacBook Pro
	386       GOARCH=386       gotip-linux-386 gomote
	arm       GOARCH=arm       gotip-linux-arm gomote
	loong64   GOARCH=loong64   gotip-linux-loong64 gomote
	ppc64le   GOARCH=ppc64le   gotip-linux-ppc64le gomote
	riscv64   GOARCH=riscv64   gotip-linux-riscv64 gomote

benchmark \ system                    c2s16     c3h88       s7    c4as16       mac       386      arm  loong64   ppc64le  riscv64

AddVW/words=1/data=random            -1.15%    -1.74%   -5.89%    -9.80%   -11.54%   +23.71%  -12.74%  -14.25%   +14.67%  +10.27%
AddVW/words=2/data=random            -2.59%         ~   -4.38%   -19.31%   -15.41%   +24.80%        ~  -19.99%   +13.73%  +19.71%
AddVW/words=3/data=random            -3.75%   -19.10%   -3.79%   -23.15%   -17.04%   +20.04%  -10.07%  -23.20%         ~  +15.39%
AddVW/words=4/data=random            -2.84%    +7.05%   -8.77%   -22.64%   -15.77%   +16.01%   -7.36%  -28.22%         ~  +23.00%
AddVW/words=5/data=random           -10.97%    +2.16%  -12.09%   -20.89%   -17.14%    +9.42%   -4.69%  -32.60%         ~  +10.07%
AddVW/words=6/data=random            -9.87%         ~   -7.54%   -19.08%    -6.46%         ~   -3.44%  -34.61%         ~  +12.19%
AddVW/words=7/data=random           -14.36%         ~  -10.09%   -19.10%   -10.47%    -6.20%   -5.06%  -38.14%   -11.54%   +6.79%
AddVW/words=8/data=random           -17.50%         ~  -11.06%   -25.14%   -12.88%    -8.35%   -5.11%  -41.39%   -14.04%  +11.87%
AddVW/words=9/data=random           -19.76%    -4.05%  -15.47%   -24.08%   -16.50%   -12.34%  -21.56%  -44.25%   -14.82%        ~
AddVW/words=10/data=random          -13.89%         ~   -9.69%   -23.06%    -8.04%   -12.58%  -19.25%  -32.80%   -11.68%        ~
AddVW/words=16/data=random          -29.36%   -15.35%  -21.86%   -25.04%   -19.89%   -32.26%  -16.29%  -42.66%   -25.92%   -3.01%
AddVW/words=32/data=random          -39.02%   -28.76%  -39.87%   -11.22%    -2.85%   -55.40%  -31.17%  -55.37%   -37.92%  -16.28%
AddVW/words=64/data=random          -25.94%   -19.09%  -20.60%    -6.90%    +8.91%   -51.00%  -43.72%  -62.27%   -44.11%  -28.74%
AddVW/words=100/data=random         -22.79%   -18.13%  -18.25%         ~   +33.89%   -67.40%  -51.77%  -63.54%   -53.75%  -30.97%
AddVW/words=1000/data=random         -8.98%    -3.84%        ~    -3.15%         ~   -93.35%  -63.92%  -65.66%   -68.67%  -42.30%
AddVW/words=10000/data=random        -1.38%    -0.38%        ~         ~         ~   -89.16%  -65.18%  -44.65%   -70.35%  -20.08%
AddVW/words=100000/data=random            ~         ~        ~         ~         ~   -87.03%  -64.51%  -36.08%   -61.40%  -16.53%

SubVW/words=1/data=random            -3.67%         ~   -8.38%   -10.26%    -3.07%   +45.78%   -6.06%  -11.17%         ~        ~
SubVW/words=2/data=random            -3.48%   -10.07%   -5.76%   -20.14%    -8.45%   +44.28%        ~  -19.09%         ~  +16.98%
SubVW/words=3/data=random            -7.11%   -26.64%   -4.48%   -22.07%    -9.21%   +35.61%        ~  -23.93%   -18.20%        ~
SubVW/words=4/data=random            -4.23%    +7.19%   -8.95%   -22.62%   -13.89%   +33.20%   -8.96%  -29.96%         ~  +22.23%
SubVW/words=5/data=random           -11.49%    +1.92%  -10.86%   -22.27%   -17.53%   +24.48%   -2.88%  -35.19%   -19.55%        ~
SubVW/words=6/data=random            -7.67%         ~   -7.72%   -18.44%    -6.24%   +12.03%   -2.00%  -39.68%   -10.73%        ~
SubVW/words=7/data=random           -13.69%   -18.32%  -11.82%   -18.92%   -11.57%    +6.63%        ~  -43.54%   -30.81%        ~
SubVW/words=8/data=random           -16.02%         ~  -11.07%   -24.50%   -11.92%    +4.32%   -3.01%  -46.95%   -24.14%        ~
SubVW/words=9/data=random           -18.76%    -3.34%  -14.84%   -23.79%   -17.50%         ~  -21.80%  -49.98%   -29.62%        ~
SubVW/words=10/data=random          -13.23%         ~   -9.25%   -21.26%   -11.63%         ~  -18.58%  -39.19%   -20.09%        ~
SubVW/words=16/data=random          -28.25%   -13.24%  -22.66%   -27.18%   -19.13%   -23.38%  -20.24%  -51.01%   -28.06%   -3.05%
SubVW/words=32/data=random          -38.41%   -28.88%  -40.12%   -11.20%    -2.80%   -49.17%  -34.67%  -63.29%   -39.25%  -15.20%
SubVW/words=64/data=random          -25.51%   -19.24%  -22.20%    -6.57%    +9.98%   -48.52%  -48.14%  -69.50%   -49.44%  -27.92%
SubVW/words=100/data=random         -21.69%   -18.51%        ~    +1.92%   +34.42%   -65.88%  -54.67%  -71.24%   -58.88%  -30.71%
SubVW/words=1000/data=random         -9.81%    -4.05%   -2.14%    -3.06%         ~   -93.37%  -67.33%  -74.12%   -68.36%  -42.17%
SubVW/words=10000/data=random             ~    -0.52%        ~         ~         ~   -88.87%  -68.54%  -44.94%   -70.63%  -19.95%
SubVW/words=100000/data=random            ~         ~        ~         ~         ~   -86.69%  -68.09%  -48.36%   -62.42%  -19.32%

AddVW/words=1/data=shortcut         -29.38%   -25.38%  -27.37%   -23.15%   -25.41%    +3.01%  -33.60%  -36.12%   -15.76%        ~
AddVW/words=2/data=shortcut         -32.79%   -34.72%  -31.47%   -24.47%   -28.21%    -3.75%  -34.66%  -43.89%   -23.65%  -21.56%
AddVW/words=3/data=shortcut         -38.50%   -46.83%  -35.67%   -26.38%   -30.29%   -10.41%  -44.89%  -47.68%   -30.93%  -26.85%
AddVW/words=4/data=shortcut         -40.40%   -28.85%  -34.19%   -29.83%   -32.95%   -16.09%  -42.86%  -51.02%   -34.19%  -26.69%
AddVW/words=5/data=shortcut         -43.87%   -35.42%  -36.46%   -32.59%   -37.72%   -20.82%  -45.14%  -54.01%   -35.49%  -30.48%
AddVW/words=6/data=shortcut         -46.98%   -39.34%  -42.22%   -35.43%   -38.18%   -27.46%  -46.72%  -56.61%   -40.21%  -34.07%
AddVW/words=7/data=shortcut         -49.63%   -47.97%  -46.61%   -35.28%   -41.93%   -31.14%  -49.29%  -58.89%   -41.10%  -37.01%
AddVW/words=8/data=shortcut         -50.48%   -42.33%  -45.40%   -40.24%   -41.74%   -32.92%  -50.62%  -60.98%   -44.85%  -38.10%
AddVW/words=9/data=shortcut         -54.27%   -43.52%  -49.06%   -42.16%   -45.22%   -37.57%  -51.84%  -62.91%   -46.04%  -40.82%
AddVW/words=10/data=shortcut        -56.01%   -45.40%  -51.42%   -43.29%   -46.14%   -38.65%  -53.65%  -64.62%   -47.05%  -43.21%
AddVW/words=16/data=shortcut        -62.73%   -55.66%  -59.31%   -56.38%   -54.31%   -53.16%  -61.03%  -72.29%   -58.24%  -52.57%
AddVW/words=32/data=shortcut        -74.00%   -69.42%  -71.75%   -33.65%   -37.35%   -71.73%  -72.59%  -82.44%   -70.87%  -67.69%
AddVW/words=64/data=shortcut        -56.69%   -52.72%  -52.09%   -35.48%   -36.87%   -84.24%  -83.10%  -90.37%   -82.56%  -80.81%
AddVW/words=100/data=shortcut       -56.68%   -53.18%  -51.49%   -33.49%   -37.72%   -89.95%  -88.21%  -93.37%   -88.47%  -86.52%
AddVW/words=1000/data=shortcut      -56.68%   -52.45%  -51.66%   -35.31%   -36.65%   -98.88%  -98.62%  -99.24%   -98.78%  -98.41%
AddVW/words=10000/data=shortcut     -56.70%   -52.40%  -51.92%   -33.49%   -36.98%   -99.89%  -99.86%  -99.92%   -99.87%  -99.91%
AddVW/words=100000/data=shortcut    -56.67%   -52.46%  -52.38%   -35.31%   -37.20%   -99.99%  -99.99%  -99.99%   -99.99%  -99.99%

SubVW/words=1/data=shortcut         -29.80%   -20.71%  -26.94%   -23.24%   -25.33%   +26.97%  -32.02%  -37.85%   -40.20%  -12.67%
SubVW/words=2/data=shortcut         -35.47%   -36.38%  -31.93%   -25.43%   -30.18%   +18.96%  -33.48%  -46.48%   -39.38%  -18.65%
SubVW/words=3/data=shortcut         -39.22%   -49.96%  -36.90%   -25.82%   -30.96%   +12.53%  -40.67%  -51.07%   -43.71%  -23.78%
SubVW/words=4/data=shortcut         -40.46%   -24.90%  -34.66%   -29.87%   -33.97%    +4.60%  -42.32%  -54.92%   -42.83%  -22.45%
SubVW/words=5/data=shortcut         -43.84%   -34.17%  -38.00%   -32.55%   -37.27%    -2.46%  -43.09%  -58.18%   -45.70%  -26.45%
SubVW/words=6/data=shortcut         -47.69%   -37.49%  -42.73%   -35.90%   -37.73%    -8.52%  -46.55%  -61.01%   -44.00%  -30.14%
SubVW/words=7/data=shortcut         -49.45%   -50.66%  -46.88%   -34.77%   -41.64%   -14.46%  -48.92%  -63.46%   -50.47%  -33.39%
SubVW/words=8/data=shortcut         -50.45%   -39.31%  -47.14%   -40.47%   -41.70%   -15.77%  -50.21%  -65.64%   -47.71%  -34.01%
SubVW/words=9/data=shortcut         -54.28%   -43.07%  -49.42%   -41.34%   -44.99%   -19.39%  -51.55%  -67.61%   -56.92%  -36.82%
SubVW/words=10/data=shortcut        -56.85%   -47.88%  -50.92%   -42.76%   -45.67%   -23.60%  -53.04%  -69.34%   -60.18%  -39.43%
SubVW/words=16/data=shortcut        -62.36%   -54.83%  -58.80%   -55.83%   -53.74%   -41.04%  -60.16%  -76.75%   -60.56%  -48.63%
SubVW/words=32/data=shortcut        -73.68%   -68.64%  -71.57%   -33.52%   -37.34%   -64.73%  -72.67%  -85.89%   -71.87%  -64.56%
SubVW/words=64/data=shortcut        -56.68%   -51.66%  -52.56%   -34.75%   -37.54%   -80.30%  -83.58%  -92.39%   -83.41%  -78.70%
SubVW/words=100/data=shortcut       -56.68%   -50.97%  -51.57%   -33.68%   -36.78%   -87.42%  -88.53%  -94.84%   -88.87%  -84.96%
SubVW/words=1000/data=shortcut      -56.68%   -50.89%  -52.10%   -34.94%   -37.77%   -98.59%  -98.71%  -99.43%   -98.80%  -98.20%
SubVW/words=10000/data=shortcut     -56.68%   -51.00%  -52.44%   -33.65%   -37.27%   -99.86%  -99.87%  -99.94%   -99.88%  -99.90%
SubVW/words=100000/data=shortcut    -56.68%   -50.80%  -52.20%   -34.79%   -37.46%   -99.99%  -99.99%  -99.99%   -99.99%  -99.99%

AddVW/words=1/data=carry             -0.51%    -5.29%  -24.03%   -26.48%         ~         ~  -33.14%  -30.23%         ~  -20.74%
AddVW/words=2/data=carry             -6.36%         ~  -21.05%   -39.40%         ~   +10.72%  -29.12%  -31.34%         ~  -17.29%
AddVW/words=3/data=carry                  ~         ~  -17.46%   -19.53%   +17.58%         ~  -26.23%  -23.61%    +7.80%  -14.34%
AddVW/words=4/data=carry            +19.02%   +16.80%        ~         ~   +28.25%         ~  -27.90%  -20.31%   +19.16%        ~
AddVW/words=5/data=carry             +3.97%   +53.02%        ~         ~   +11.31%         ~  -19.05%  -17.47%   +16.81%        ~
AddVW/words=6/data=carry             +2.98%   +19.83%        ~         ~   +14.84%         ~  -18.48%  -14.92%   +18.25%        ~
AddVW/words=7/data=carry                  ~         ~        ~         ~   +27.17%         ~  -15.50%  -12.74%   +13.00%        ~
AddVW/words=8/data=carry             +0.58%   +22.32%        ~    +6.10%   +29.63%         ~  -13.04%        ~   +28.46%   +2.95%
AddVW/words=9/data=carry                  ~   +31.53%        ~         ~   +14.42%         ~  -11.32%        ~   +18.37%   +3.28%
AddVW/words=10/data=carry            +3.94%   +22.36%        ~    +6.29%   +19.22%         ~  -11.27%        ~   +20.10%   +3.91%
AddVW/words=16/data=carry            +2.82%   +14.23%        ~   +10.06%   +25.91%   -16.12%        ~        ~   +52.28%  +10.40%
AddVW/words=32/data=carry                 ~   +25.35%  +13.66%         ~   +34.89%   -34.39%   +6.51%  -18.71%   +41.06%  +19.42%
AddVW/words=64/data=carry           -42.03%         ~  -39.70%    +6.65%   +32.29%   -39.94%  +14.34%        ~   +19.68%  +20.86%
AddVW/words=100/data=carry          -33.95%   -34.28%  -39.65%         ~   +27.72%   -26.80%  +17.40%        ~   +26.39%  +23.32%
AddVW/words=1000/data=carry         -42.49%   -47.87%  -47.44%    +1.25%    +4.25%   -41.76%  +23.40%        ~   +25.48%  +27.99%
AddVW/words=10000/data=carry        -41.85%   -48.49%  -49.43%         ~         ~   -42.09%  +24.61%  -10.32%   +40.55%  +18.35%
AddVW/words=100000/data=carry       -28.18%   -48.13%  -48.24%    +1.35%         ~   -42.90%  +24.73%   -9.79%   +22.55%  +17.16%

SubVW/words=1/data=carry            -10.32%   -17.16%  -24.14%   -26.24%         ~   +18.43%  -34.10%  -29.54%    -9.57%        ~
SubVW/words=2/data=carry            -19.45%   -23.31%  -20.74%   -39.73%         ~   +15.74%  -28.13%  -30.21%         ~  -18.74%
SubVW/words=3/data=carry                  ~   -16.18%  -15.34%   -19.54%   +17.62%   +12.39%  -27.64%  -27.09%         ~  -14.97%
SubVW/words=4/data=carry            +11.67%   +24.42%        ~         ~   +25.11%   +14.07%  -28.08%  -26.18%         ~        ~
SubVW/words=5/data=carry             +8.08%   +25.64%        ~         ~   +10.35%    +8.12%  -21.75%  -25.50%         ~   -4.86%
SubVW/words=6/data=carry                  ~   +13.82%        ~         ~   +12.92%    +6.79%  -20.25%  -24.70%         ~   -2.74%
SubVW/words=7/data=carry                  ~         ~   +8.29%    +4.51%   +26.59%    +4.62%  -18.01%  -24.09%         ~   -1.26%
SubVW/words=8/data=carry                  ~   +23.16%  +16.19%    +6.16%   +25.46%    +6.74%  -15.57%  -22.74%         ~   +1.44%
SubVW/words=9/data=carry                  ~   +30.71%  +20.81%         ~   +12.36%         ~  -12.99%        ~         ~   +3.13%
SubVW/words=10/data=carry            +5.03%   +19.53%  +14.84%   +14.16%   +16.12%         ~  -11.64%  -16.00%   +15.45%   +3.29%
SubVW/words=16/data=carry           +14.42%   +15.58%  +33.07%   +11.43%   +24.65%         ~        ~  -21.90%   +25.59%   +9.40%
SubVW/words=32/data=carry                 ~   +27.57%  +46.58%         ~   +35.35%    -8.49%        ~  -24.04%   +11.86%  +18.40%
SubVW/words=64/data=carry           -24.34%   -27.83%  -20.90%   +13.34%   +37.17%   -14.90%        ~   -8.81%   +12.88%  +18.92%
SubVW/words=100/data=carry          -25.19%   -34.70%  -27.45%   +12.86%   +28.42%   -14.48%        ~        ~   +25.71%  +21.93%
SubVW/words=1000/data=carry         -24.93%   -47.86%  -47.26%    +2.66%         ~   -23.88%        ~        ~   +25.99%  +27.81%
SubVW/words=10000/data=carry        -24.17%   -36.48%  -49.41%    +1.06%         ~   -25.06%        ~  -26.50%   +27.94%  +18.36%
SubVW/words=100000/data=carry       -22.51%   -35.86%  -49.46%    +3.96%         ~   -25.18%        ~  -22.15%   +26.86%  +15.44%

Change-Id: I8f252073040e674780ac6ec9912082fb205329dd
Reviewed-on: https://go-review.googlesource.com/c/go/+/664898
Reviewed-by: Alan Donovan <adonovan@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Russ Cox 2025-04-07 17:13:20 -04:00
parent b44b360dd4
commit a11643df8f
16 changed files with 96 additions and 1000 deletions

View File

@ -175,9 +175,6 @@ func TestIntendedInlining(t *testing.T) {
},
"math/big": {
"bigEndianWord",
// The following functions require the math_big_pure_go build tag.
"addVW",
"subVW",
},
"math/rand": {
"(*rngSource).Int63",

View File

@ -10,7 +10,10 @@
package big
import "math/bits"
import (
"math/bits"
_ "unsafe" // for go:linkname
)
// A Word represents a single digit of a multi-precision unsigned integer.
type Word uint
@ -82,11 +85,50 @@ func subVV_g(z, x, y []Word) (c Word) {
return
}
// The resulting carry c is either 0 or 1.
func addVW_g(z, x []Word, y Word) (c Word) {
// addVW sets z = x + y, returning the final carry c.
// The behavior is undefined if len(x) != len(z).
// If len(z) == 0, c = y; otherwise, c is 0 or 1.
//
// addVW should be an internal detail,
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:
// - github.com/remyoudompheng/bigfft
//
// Do not remove or change the type signature.
// See go.dev/issue/67401.
//
//go:linkname addVW
func addVW(z, x []Word, y Word) (c Word) {
	n := len(z)
	x = x[:n]
	if n == 0 {
		// Nothing to add into: the whole of y is the carry out.
		return y
	}

	// Add y into the lowest word; only this step can involve a carry
	// larger than what a simple increment handles.
	sum, carry := bits.Add(uint(x[0]), uint(y), 0)
	z[0] = Word(sum)

	// Ripple the carry upward. An all-ones word wraps to zero and keeps
	// the carry alive; any other word absorbs it and ends the ripple.
	next := 1
	for carry != 0 {
		if next == n {
			// The carry ran off the top of the vector.
			return 1
		}
		w := x[next]
		if w == ^Word(0) {
			z[next] = 0
		} else {
			z[next] = w + 1
			carry = 0
		}
		next++
	}

	// No carry remains, so the rest of z is just the rest of x.
	// When z and x begin at the same word the data is already in place
	// and the copy is skipped.
	if &z[0] != &x[0] {
		copy(z[next:], x[next:])
	}
	return 0
}
// addVW_ref is the reference implementation for addVW, used only for testing.
func addVW_ref(z, x []Word, y Word) (c Word) {
c = y
// The comment near the top of this file discusses this for loop condition.
for i := 0; i < len(z) && i < len(x); i++ {
for i := range z {
zi, cc := bits.Add(uint(x[i]), uint(c), 0)
z[i] = Word(zi)
c = Word(cc)
@ -94,53 +136,55 @@ func addVW_g(z, x []Word, y Word) (c Word) {
return
}
// addVWlarge is addVW, but intended for large z.
// The only difference is that we check on every iteration
// whether we are done with carries,
// and if so, switch to a much faster copy instead.
// This is only a good idea for large z,
// because the overhead of the check and the function call
// outweigh the benefits when z is small.
func addVWlarge(z, x []Word, y Word) (c Word) {
c = y
// The comment near the top of this file discusses this for loop condition.
for i := 0; i < len(z) && i < len(x); i++ {
if c == 0 {
copy(z[i:], x[i:])
return
}
zi, cc := bits.Add(uint(x[i]), uint(c), 0)
z[i] = Word(zi)
c = Word(cc)
// subVW sets z = x - y, returning the final carry c.
// The behavior is undefined if len(x) != len(z).
// If len(z) == 0, c = y; otherwise, c is 0 or 1.
//
// subVW should be an internal detail,
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:
// - github.com/remyoudompheng/bigfft
//
// Do not remove or change the type signature.
// See go.dev/issue/67401.
//
//go:linkname subVW
func subVW(z, x []Word, y Word) (c Word) {
x = x[:len(z)]
if len(z) == 0 {
return y
}
return
zi, cc := bits.Sub(uint(x[0]), uint(y), 0)
z[0] = Word(zi)
if cc == 0 {
if &z[0] != &x[0] {
copy(z[1:], x[1:])
}
return 0
}
for i := 1; i < len(z); i++ {
xi := x[i]
if xi != 0 {
z[i] = xi - 1
if &z[0] != &x[0] {
copy(z[i+1:], x[i+1:])
}
return 0
}
z[i] = ^Word(0)
}
return 1
}
func subVW_g(z, x []Word, y Word) (c Word) {
// subVW_ref is the reference implementation for subVW, used only for testing.
func subVW_ref(z, x []Word, y Word) (c Word) {
c = y
// The comment near the top of this file discusses this for loop condition.
for i := 0; i < len(z) && i < len(x); i++ {
for i := range z {
zi, cc := bits.Sub(uint(x[i]), uint(c), 0)
z[i] = Word(zi)
c = Word(cc)
}
return
}
// subVWlarge is to subVW as addVWlarge is to addVW.
func subVWlarge(z, x []Word, y Word) (c Word) {
c = y
// The comment near the top of this file discusses this for loop condition.
for i := 0; i < len(z) && i < len(x); i++ {
if c == 0 {
copy(z[i:], x[i:])
return
}
zi, cc := bits.Sub(uint(x[i]), uint(c), 0)
z[i] = Word(zi)
c = Word(cc)
}
return
return c
}
func lshVU_g(z, x []Word, s uint) (c Word) {

View File

@ -60,51 +60,6 @@ E2: CMPL BX, BP // i < n
RET
// func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL y+24(FP), AX // c = y
MOVL z_len+4(FP), BP
MOVL $0, BX // i = 0
JMP E3
L3: ADDL (SI)(BX*4), AX
MOVL AX, (DI)(BX*4)
SBBL AX, AX // save CF
NEGL AX
ADDL $1, BX // i++
E3: CMPL BX, BP // i < n
JL L3
MOVL AX, c+28(FP)
RET
// func subVW(z, x []Word, y Word) (c Word)
TEXT ·subVW(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL y+24(FP), AX // c = y
MOVL z_len+4(FP), BP
MOVL $0, BX // i = 0
JMP E4
L4: MOVL (SI)(BX*4), DX
SUBL AX, DX
MOVL DX, (DI)(BX*4)
SBBL AX, AX // save CF
NEGL AX
ADDL $1, BX // i++
E4: CMPL BX, BP // i < n
JL L4
MOVL AX, c+28(FP)
RET
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB),NOSPLIT,$0
MOVL z_len+4(FP), BX // i = z

View File

@ -121,119 +121,6 @@ E2: NEGQ CX
MOVQ CX, c+72(FP) // return c
RET
// func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB),NOSPLIT,$0
MOVQ z_len+8(FP), DI
CMPQ DI, $32
JG large
MOVQ x+24(FP), R8
MOVQ y+48(FP), CX // c = y
MOVQ z+0(FP), R10
MOVQ $0, SI // i = 0
// s/JL/JMP/ below to disable the unrolled loop
SUBQ $4, DI // n -= 4
JL V3 // if n < 4 goto V3
U3: // n >= 0
// regular loop body unrolled 4x
MOVQ 0(R8)(SI*8), R11
MOVQ 8(R8)(SI*8), R12
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
ADDQ CX, R11
ADCQ $0, R12
ADCQ $0, R13
ADCQ $0, R14
SBBQ CX, CX // save CF
NEGQ CX
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
JGE U3 // if n >= 0 goto U3
V3: ADDQ $4, DI // n += 4
JLE E3 // if n <= 0 goto E3
L3: // n > 0
ADDQ 0(R8)(SI*8), CX
MOVQ CX, 0(R10)(SI*8)
SBBQ CX, CX // save CF
NEGQ CX
ADDQ $1, SI // i++
SUBQ $1, DI // n--
JG L3 // if n > 0 goto L3
E3: MOVQ CX, c+56(FP) // return c
RET
large:
JMP ·addVWlarge(SB)
// func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
TEXT ·subVW(SB),NOSPLIT,$0
MOVQ z_len+8(FP), DI
CMPQ DI, $32
JG large
MOVQ x+24(FP), R8
MOVQ y+48(FP), CX // c = y
MOVQ z+0(FP), R10
MOVQ $0, SI // i = 0
// s/JL/JMP/ below to disable the unrolled loop
SUBQ $4, DI // n -= 4
JL V4 // if n < 4 goto V4
U4: // n >= 0
// regular loop body unrolled 4x
MOVQ 0(R8)(SI*8), R11
MOVQ 8(R8)(SI*8), R12
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
SUBQ CX, R11
SBBQ $0, R12
SBBQ $0, R13
SBBQ $0, R14
SBBQ CX, CX // save CF
NEGQ CX
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
JGE U4 // if n >= 0 goto U4
V4: ADDQ $4, DI // n += 4
JLE E4 // if n <= 0 goto E4
L4: // n > 0
MOVQ 0(R8)(SI*8), R11
SUBQ CX, R11
MOVQ R11, 0(R10)(SI*8)
SBBQ CX, CX // save CF
NEGQ CX
ADDQ $1, SI // i++
SUBQ $1, DI // n--
JG L4 // if n > 0 goto L4
E4: MOVQ CX, c+56(FP) // return c
RET
large:
JMP ·subVWlarge(SB)
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB),NOSPLIT,$0
MOVQ z_len+8(FP), BX // i = z

View File

@ -58,66 +58,6 @@ E2:
RET
// func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB),NOSPLIT,$0
MOVW z+0(FP), R1
MOVW z_len+4(FP), R4
MOVW x+12(FP), R2
MOVW y+24(FP), R3
ADD R4<<2, R1, R4
TEQ R1, R4
BNE L3a
MOVW R3, c+28(FP)
RET
L3a:
MOVW.P 4(R2), R5
ADD.S R3, R5
MOVW.P R5, 4(R1)
B E3
L3:
MOVW.P 4(R2), R5
ADC.S $0, R5
MOVW.P R5, 4(R1)
E3:
TEQ R1, R4
BNE L3
MOVW $0, R0
MOVW.CS $1, R0
MOVW R0, c+28(FP)
RET
// func subVW(z, x []Word, y Word) (c Word)
TEXT ·subVW(SB),NOSPLIT,$0
MOVW z+0(FP), R1
MOVW z_len+4(FP), R4
MOVW x+12(FP), R2
MOVW y+24(FP), R3
ADD R4<<2, R1, R4
TEQ R1, R4
BNE L4a
MOVW R3, c+28(FP)
RET
L4a:
MOVW.P 4(R2), R5
SUB.S R3, R5
MOVW.P R5, 4(R1)
B E4
L4:
MOVW.P 4(R2), R5
SBC.S $0, R5
MOVW.P R5, 4(R1)
E4:
TEQ R1, R4
BNE L4
MOVW $0, R0
MOVW.CC $1, R0
MOVW R0, c+28(FP)
RET
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB),NOSPLIT,$0
MOVW z_len+4(FP), R5

View File

@ -93,164 +93,6 @@ done:
MOVD R0, c+72(FP)
RET
#define vwOneOp(instr, op1) \
MOVD.P 8(R1), R4; \
instr op1, R4; \
MOVD.P R4, 8(R3);
// handle the first 1~4 elements before starting iteration in addVW/subVW
#define vwPreIter(instr1, instr2, counter, target) \
vwOneOp(instr1, R2); \
SUB $1, counter; \
CBZ counter, target; \
vwOneOp(instr2, $0); \
SUB $1, counter; \
CBZ counter, target; \
vwOneOp(instr2, $0); \
SUB $1, counter; \
CBZ counter, target; \
vwOneOp(instr2, $0);
// do one iteration of add or sub in addVW/subVW
#define vwOneIter(instr, counter, exit) \
CBZ counter, exit; \ // careful not to touch the carry flag
LDP.P 32(R1), (R4, R5); \
LDP -16(R1), (R6, R7); \
instr $0, R4, R8; \
instr $0, R5, R9; \
instr $0, R6, R10; \
instr $0, R7, R11; \
STP.P (R8, R9), 32(R3); \
STP (R10, R11), -16(R3); \
SUB $4, counter;
// do one iteration of copy in addVW/subVW
#define vwOneIterCopy(counter, exit) \
CBZ counter, exit; \
LDP.P 32(R1), (R4, R5); \
LDP -16(R1), (R6, R7); \
STP.P (R4, R5), 32(R3); \
STP (R6, R7), -16(R3); \
SUB $4, counter;
// func addVW(z, x []Word, y Word) (c Word)
// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
// and switches to copy if we are done with carries. The copying is skipped as well
// if 'x' and 'z' happen to share the same underlying storage.
// The overhead of the checking and branching is visible when 'z' is small (~5%),
// so set a threshold of 32, and leave the small-sized part entirely untouched.
TEXT ·addVW(SB),NOSPLIT,$0
MOVD z+0(FP), R3
MOVD z_len+8(FP), R0
MOVD x+24(FP), R1
MOVD y+48(FP), R2
CMP $32, R0
BGE large // large-sized 'z' and 'x'
CBZ R0, len0 // the length of z is 0
MOVD.P 8(R1), R4
ADDS R2, R4 // z[0] = x[0] + y, set carry
MOVD.P R4, 8(R3)
SUB $1, R0
CBZ R0, len1 // the length of z is 1
TBZ $0, R0, two
MOVD.P 8(R1), R4 // do it once
ADCS $0, R4
MOVD.P R4, 8(R3)
SUB $1, R0
two: // do it twice
TBZ $1, R0, loop
LDP.P 16(R1), (R4, R5)
ADCS $0, R4, R8 // c, z[i] = x[i] + c
ADCS $0, R5, R9
STP.P (R8, R9), 16(R3)
SUB $2, R0
loop: // do four times per round
vwOneIter(ADCS, R0, len1)
B loop
len1:
CSET HS, R2 // extract carry flag
len0:
MOVD R2, c+56(FP)
done:
RET
large:
AND $0x3, R0, R10
AND $~0x3, R0
// unrolling for the first 1~4 elements to avoid saving the carry
// flag in each step, adjust $R0 if we unrolled 4 elements
vwPreIter(ADDS, ADCS, R10, add4)
SUB $4, R0
add4:
BCC copy
vwOneIter(ADCS, R0, len1)
B add4
copy:
MOVD ZR, c+56(FP)
CMP R1, R3
BEQ done
copy_4: // no carry flag, copy the rest
vwOneIterCopy(R0, done)
B copy_4
// func subVW(z, x []Word, y Word) (c Word)
// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
// and switches to copy if we are done with carries. The copying is skipped as well
// if 'x' and 'z' happen to share the same underlying storage.
// The overhead of the checking and branching is visible when 'z' is small (~5%),
// so set a threshold of 32, and leave the small-sized part entirely untouched.
TEXT ·subVW(SB),NOSPLIT,$0
MOVD z+0(FP), R3
MOVD z_len+8(FP), R0
MOVD x+24(FP), R1
MOVD y+48(FP), R2
CMP $32, R0
BGE large // large-sized 'z' and 'x'
CBZ R0, len0 // the length of z is 0
MOVD.P 8(R1), R4
SUBS R2, R4 // z[0] = x[0] - y, set carry
MOVD.P R4, 8(R3)
SUB $1, R0
CBZ R0, len1 // the length of z is 1
TBZ $0, R0, two // do it once
MOVD.P 8(R1), R4
SBCS $0, R4
MOVD.P R4, 8(R3)
SUB $1, R0
two: // do it twice
TBZ $1, R0, loop
LDP.P 16(R1), (R4, R5)
SBCS $0, R4, R8 // c, z[i] = x[i] + c
SBCS $0, R5, R9
STP.P (R8, R9), 16(R3)
SUB $2, R0
loop: // do four times per round
vwOneIter(SBCS, R0, len1)
B loop
len1:
CSET LO, R2 // extract carry flag
len0:
MOVD R2, c+56(FP)
done:
RET
large:
AND $0x3, R0, R10
AND $~0x3, R0
// unrolling for the first 1~4 elements to avoid saving the carry
// flag in each step, adjust $R0 if we unrolled 4 elements
vwPreIter(SUBS, SBCS, R10, sub4)
SUB $4, R0
sub4:
BCS copy
vwOneIter(SBCS, R0, len1)
B sub4
copy:
MOVD ZR, c+56(FP)
CMP R1, R3
BEQ done
copy_4: // no carry flag, copy the rest
vwOneIterCopy(R0, done)
B copy_4
// func lshVU(z, x []Word, s uint) (c Word)
// This implementation handles the shift operation from the high word to the low word,
// which may be an error for the case where the low word of x overlaps with the high

View File

@ -34,30 +34,6 @@ func addVV(z, x, y []Word) (c Word)
//go:noescape
func subVV(z, x, y []Word) (c Word)
// addVW should be an internal detail,
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:
// - github.com/remyoudompheng/bigfft
//
// Do not remove or change the type signature.
// See go.dev/issue/67401.
//
//go:linkname addVW
//go:noescape
func addVW(z, x []Word, y Word) (c Word)
// subVW should be an internal detail,
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:
// - github.com/remyoudompheng/bigfft
//
// Do not remove or change the type signature.
// See go.dev/issue/67401.
//
//go:linkname subVW
//go:noescape
func subVW(z, x []Word, y Word) (c Word)
// shlVU should be an internal detail (and a stale one at that),
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:

View File

@ -14,24 +14,6 @@ func subVV(z, x, y []Word) (c Word) {
return subVV_g(z, x, y)
}
func addVW(z, x []Word, y Word) (c Word) {
// TODO: remove indirect function call when golang.org/issue/30548 is fixed
fn := addVW_g
if len(z) > 32 {
fn = addVWlarge
}
return fn(z, x, y)
}
func subVW(z, x []Word, y Word) (c Word) {
// TODO: remove indirect function call when golang.org/issue/30548 is fixed
fn := subVW_g
if len(z) > 32 {
fn = subVWlarge
}
return fn(z, x, y)
}
func lshVU(z, x []Word, s uint) (c Word) {
return lshVU_g(z, x, s)
}

View File

@ -42,56 +42,6 @@ done:
MOVV R8, c+72(FP)
RET
// func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB),NOSPLIT,$0
// input:
// R4: z
// R5: z_len
// R7: x
// R10: y
MOVV z+0(FP), R4
MOVV z_len+8(FP), R5
MOVV x+24(FP), R7
MOVV y+48(FP), R10
MOVV $0, R6
SLLV $3, R5
loop:
BEQ R5, R6, done
MOVV (R6)(R7), R8
ADDV R8, R10, R9 // x1 + c = z1, if z1 < x1 then z1 overflow
SGTU R8, R9, R10
MOVV R9, (R6)(R4)
ADDV $8, R6
JMP loop
done:
MOVV R10, c+56(FP)
RET
// func subVW(z, x []Word, y Word) (c Word)
TEXT ·subVW(SB),NOSPLIT,$0
// input:
// R4: z
// R5: z_len
// R7: x
// R10: y
MOVV z+0(FP), R4
MOVV z_len+8(FP), R5
MOVV x+24(FP), R7
MOVV y+48(FP), R10
MOVV $0, R6
SLLV $3, R5
loop:
BEQ R5, R6, done
MOVV (R6)(R7), R8
SUBV R10, R8, R11 // x1 - c = z1, if z1 > x1 then overflow
SGTU R11, R8, R10
MOVV R11, (R6)(R4)
ADDV $8, R6
JMP loop
done:
MOVV R10, c+56(FP)
RET
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)

View File

@ -15,12 +15,6 @@ TEXT ·addVV(SB),NOSPLIT,$0
TEXT ·subVV(SB),NOSPLIT,$0
JMP ·subVV_g(SB)
TEXT ·addVW(SB),NOSPLIT,$0
JMP ·addVW_g(SB)
TEXT ·subVW(SB),NOSPLIT,$0
JMP ·subVW_g(SB)
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)

View File

@ -15,12 +15,6 @@ TEXT ·addVV(SB),NOSPLIT,$0
TEXT ·subVV(SB),NOSPLIT,$0
JMP ·subVV_g(SB)
TEXT ·addVW(SB),NOSPLIT,$0
JMP ·addVW_g(SB)
TEXT ·subVW(SB),NOSPLIT,$0
JMP ·subVW_g(SB)
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)

View File

@ -188,157 +188,6 @@ done:
MOVD R4, c+72(FP)
RET
// func addVW(z, x []Word, y Word) (c Word)
//
// addVW stores x + y into z, one word at a time over z_len words, and
// returns the carry out of the last word in c. When z_len is zero nothing
// is read or written and y itself is returned as c.
//
// Structure: the first word is added with ADDC so that the carry bit (CA)
// reflects x[0] + y; every later word is just x[i] + CA (ADDZE). Words are
// then handled four at a time in a CTR-counted loop, followed by a tail of
// at most three single words.
TEXT ·addVW(SB), NOSPLIT, $0
MOVD z+0(FP), R10 // R10 = z[]
MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R4 // R4 = y = c
MOVD z_len+8(FP), R11 // R11 = z_len
CMP R11, $0 // If z_len is zero, return
BEQ done
// We will process the first iteration out of the loop so we capture
// the value of c. In the subsequent iterations, we will rely on the
// value of CA set here.
MOVD 0(R8), R20 // R20 = x[i]
ADD $-1, R11 // R11 = z_len - 1
ADDC R20, R4, R6 // R6 = x[i] + c
CMP R11, $0 // If z_len was 1, we are done
MOVD R6, 0(R10) // z[i]
BEQ final
// We will read 4 elements per iteration
SRDCC $2, R11, R9 // R9 = R11/4 = (z_len-1)/4, the number of 4-word iterations
DCBT (R8)
MOVD R9, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
PCALIGN $16
loop:
// The MOVDU forms are the last load and the last store of each group;
// there is no separate pointer increment, so they are what move R8 and
// R10 forward by the 32 bytes consumed per iteration.
MOVD 8(R8), R20 // R20 = x[i]
MOVD 16(R8), R21 // R21 = x[i+1]
MOVD 24(R8), R22 // R22 = x[i+2]
MOVDU 32(R8), R23 // R23 = x[i+3]
ADDZE R20, R24 // R24 = x[i] + CA
ADDZE R21, R25 // R25 = x[i+1] + CA
ADDZE R22, R26 // R26 = x[i+2] + CA
ADDZE R23, R27 // R27 = x[i+3] + CA
MOVD R24, 8(R10) // z[i]
MOVD R25, 16(R10) // z[i+1]
MOVD R26, 24(R10) // z[i+2]
MOVDU R27, 32(R10) // z[i+3]
ADD $-4, R11 // R11 -= 4 (words still to process)
BDNZ loop
// We may have some elements to read
CMP R11, $0
BEQ final
tail:
// One to three words remain. Each step handles one word and, except for
// the last, uses MOVDU to step both pointers by 8 bytes.
MOVDU 8(R8), R20
ADDZE R20, R24
ADD $-1, R11
MOVDU R24, 8(R10)
CMP R11, $0
BEQ final
MOVDU 8(R8), R20
ADDZE R20, R24
ADD $-1, R11
MOVDU R24, 8(R10)
CMP R11, $0
BEQ final
MOVD 8(R8), R20
ADDZE R20, R24
MOVD R24, 8(R10)
final:
ADDZE R0, R4 // c = CA
done:
MOVD R4, c+56(FP)
RET
// func subVW(z, x []Word, y Word) (c Word)
//
// subVW stores x - y into z, one word at a time over z_len words, and
// returns the borrow out of the last word in c (0 or 1). When z_len is
// zero nothing is read or written and y itself is returned as c.
//
// Structure mirrors addVW: the first word is subtracted with SUBC to
// establish CA, later words only propagate the borrow with SUBE against
// R0, four words per iteration of a CTR-counted loop, then a tail of at
// most three single words.
TEXT ·subVW(SB), NOSPLIT, $0
MOVD z+0(FP), R10 // R10 = z[]
MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R4 // R4 = y = c
MOVD z_len+8(FP), R11 // R11 = z_len
CMP R11, $0 // If z_len is zero, return
BEQ done
// We will process the first iteration out of the loop so we capture
// the value of c. In the subsequent iterations, we will rely on the
// value of CA set here.
MOVD 0(R8), R20 // R20 = x[i]
ADD $-1, R11 // R11 = z_len - 1
SUBC R4, R20, R6 // R6 = x[i] - c
CMP R11, $0 // If z_len was 1, we are done
MOVD R6, 0(R10) // z[i]
BEQ final
// We will read 4 elements per iteration
SRDCC $2, R11, R9 // R9 = R11/4 = (z_len-1)/4, the number of 4-word iterations
DCBT (R8)
MOVD R9, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
// The loop here is almost the same as the one used in s390x, but
// we don't need to capture CA every iteration because we've already
// done that above.
PCALIGN $16
loop:
// Load four words; the final MOVDU also steps R8 past them.
MOVD 8(R8), R20
MOVD 16(R8), R21
MOVD 24(R8), R22
MOVDU 32(R8), R23
// Propagate the borrow through each word in order (R0 is the subtrahend).
SUBE R0, R20
SUBE R0, R21
SUBE R0, R22
SUBE R0, R23
// Store four results; the final MOVDU also steps R10 past them.
MOVD R20, 8(R10)
MOVD R21, 16(R10)
MOVD R22, 24(R10)
MOVDU R23, 32(R10)
ADD $-4, R11 // R11 -= 4 (words still to process)
BDNZ loop
// We may have some elements to read
CMP R11, $0
BEQ final
tail:
// One to three words remain; handle them one at a time.
MOVDU 8(R8), R20
SUBE R0, R20
ADD $-1, R11
MOVDU R20, 8(R10)
CMP R11, $0
BEQ final
MOVDU 8(R8), R20
SUBE R0, R20
ADD $-1, R11
MOVDU R20, 8(R10)
CMP R11, $0
BEQ final
MOVD 8(R8), R20
SUBE R0, R20
MOVD R20, 8(R10)
final:
// Capture CA
// SUBE of a register with itself leaves 0 when there is no borrow and -1
// when there is one; NEG turns that into the 0/1 value returned in c.
SUBE R4, R4
NEG R4, R4
done:
MOVD R4, c+56(FP)
RET
//func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0
MOVD z+0(FP), R3

View File

@ -173,126 +173,6 @@ done:
MOV X29, c+72(FP) // return b
RET
// func addVW(z, x []Word, y Word) (c Word)
//
// addVW stores x + y into z over z_len words and returns the final carry.
// With z_len == 0 the function returns y unchanged.
//
// Register use: X5 = x pointer, X7 = z pointer, X30 = words remaining,
// X28 = constant 4, X29 = running carry (starts as y).
// The carry is recomputed after every add with SLTU: the sum is smaller
// than the x word exactly when the addition wrapped.
TEXT ·addVW(SB),NOSPLIT,$0
MOV x+24(FP), X5
MOV y+48(FP), X6
MOV z+0(FP), X7
MOV z_len+8(FP), X30
MOV $4, X28
MOV X6, X29 // c = y
BEQZ X30, done
BLTU X30, X28, loop1 // fewer than 4 words: skip the unrolled loop
loop4:
// Unrolled body: four words per iteration while at least 4 remain.
MOV 0(X5), X8 // x[0]
MOV 8(X5), X11 // x[1]
MOV 16(X5), X14 // x[2]
MOV 24(X5), X17 // x[3]
ADD X8, X29, X10 // z[0] = x[0] + c
SLTU X8, X10, X29 // next c
ADD X11, X29, X13 // z[1] = x[1] + c
SLTU X11, X13, X29 // next c
ADD X14, X29, X16 // z[2] = x[2] + c
SLTU X14, X16, X29 // next c
ADD X17, X29, X19 // z[3] = x[3] + c
SLTU X17, X19, X29 // next c
MOV X10, 0(X7) // z[0]
MOV X13, 8(X7) // z[1]
MOV X16, 16(X7) // z[2]
MOV X19, 24(X7) // z[3]
ADD $32, X5
ADD $32, X7
SUB $4, X30
BGEU X30, X28, loop4
BEQZ X30, done
loop1:
// Remainder: one word per iteration until X30 reaches zero.
MOV 0(X5), X10 // x
ADD X10, X29, X12 // z = x + c
SLTU X10, X12, X29 // next c
MOV X12, 0(X7) // z
ADD $8, X5
ADD $8, X7
SUB $1, X30
BNEZ X30, loop1
done:
MOV X29, c+56(FP) // return c
RET
// func subVW(z, x []Word, y Word) (c Word)
//
// subVW stores x - y into z over z_len words and returns the final borrow.
// With z_len == 0 the function returns y unchanged.
//
// Register use: X5 = x pointer, X7 = z pointer, X30 = words remaining,
// X28 = constant 4, X29 = running borrow (starts as y).
// The borrow is recomputed after every subtract with SLTU: the x word is
// smaller than the difference exactly when the subtraction wrapped.
TEXT ·subVW(SB),NOSPLIT,$0
MOV x+24(FP), X5
MOV y+48(FP), X6
MOV z+0(FP), X7
MOV z_len+8(FP), X30
MOV $4, X28
MOV X6, X29 // b = y
BEQZ X30, done
BLTU X30, X28, loop1 // fewer than 4 words: skip the unrolled loop
loop4:
// Unrolled body: four words per iteration while at least 4 remain.
MOV 0(X5), X8 // x[0]
MOV 8(X5), X11 // x[1]
MOV 16(X5), X14 // x[2]
MOV 24(X5), X17 // x[3]
SUB X29, X8, X10 // z[0] = x[0] - b
SLTU X10, X8, X29 // next b
SUB X29, X11, X13 // z[1] = x[1] - b
SLTU X13, X11, X29 // next b
SUB X29, X14, X16 // z[2] = x[2] - b
SLTU X16, X14, X29 // next b
SUB X29, X17, X19 // z[3] = x[3] - b
SLTU X19, X17, X29 // next b
MOV X10, 0(X7) // z[0]
MOV X13, 8(X7) // z[1]
MOV X16, 16(X7) // z[2]
MOV X19, 24(X7) // z[3]
ADD $32, X5
ADD $32, X7
SUB $4, X30
BGEU X30, X28, loop4
BEQZ X30, done
loop1:
// Remainder: one word per iteration until X30 reaches zero.
MOV 0(X5), X10 // x
SUB X29, X10, X12 // z = x - b
SLTU X12, X10, X29 // next b
MOV X12, 0(X7) // z
ADD $8, X5
ADD $8, X7
SUB $1, X30
BNEZ X30, loop1
done:
MOV X29, c+56(FP) // return b
RET
// lshVU has no assembly body of its own here: the entry point is a single
// jump to lshVU_g, which does all the work.
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)

View File

@ -500,188 +500,6 @@ E1:
MOVD R4, c+72(FP) // return c
RET
// func addVW(z, x []Word, y Word) (c Word)
//
// addVW stores x + y into z over z_len words and returns the final carry.
// With z_len == 0 it returns y unchanged (returnC).
//
// Flow:
//   1. The first two words are added unconditionally (ADDC, then ADDE).
//   2. loopOverEachWord keeps adding 1 to successive words for as long as
//      the previous add produced a carry.
//   3. As soon as an add produces no carry, control moves to copySetup and
//      the remaining words of x are copied to z with MVC; the result carry
//      is then 0 (returnZero).
//   4. If the words run out while still carrying, returnResult materializes
//      the carry flag as the return value.
//
// Register use: R5 = words remaining, R6 = x pointer, R8 = z pointer,
// R7 = y / first sum, R9 = scratch sum, R12 = byte offset of the current word.
TEXT ·addVW(SB), NOSPLIT, $0
MOVD z_len+8(FP), R5 // length of z
MOVD x+24(FP), R6
MOVD y+48(FP), R7 // c = y
MOVD z+0(FP), R8
CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return
// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
ADDC 0(R6), R7
MOVD R7, 0(R8)
CMPBEQ R5, $1, returnResult // len(z) == 1
MOVD $0, R9
ADDE 8(R6), R9
MOVD R9, 8(R8)
CMPBEQ R5, $2, returnResult // len(z) == 2
// Update the counters
// NOTE(review): the BRC at the top of the loop tests the carry left by the
// ADDE above, so these two MOVDs presumably do not disturb the condition
// code — confirm against the instruction encodings.
MOVD $16, R12 // i = 2
MOVD $-2(R5), R5 // n = n - 2
loopOverEachWord:
BRC $12, copySetup // carry = 0, copy the rest
MOVD $1, R9
// Originally we used the carry flag generated in the previous iteration
// (i.e: ADDE could be used here to do the addition). However, since we
// already know carry is 1 (otherwise we will go to copy section), we can use
// ADDC here so the current iteration does not depend on the carry flag
// generated in the previous iteration. This could be useful when branch prediction happens.
ADDC 0(R6)(R12*1), R9
MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c
MOVD $8(R12), R12 // i++
BRCTG R5, loopOverEachWord // n--
// Return the current carry value
returnResult:
// R0 = 0 + 0 + carry, i.e. the carry flag as a 0/1 word.
MOVD $0, R0
ADDE R0, R0
MOVD R0, c+56(FP)
RET
// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
// With the assumption that x and z will not overlap with each other or x and z will
// point to same memory region, we can use a faster version of copy using only MVC here.
// In the following implementation, we have three copy loops, each copying a word, 4 words, and
// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
ADD R12, R6
ADD R12, R8
CMPBGE R5, $4, mediumLoop
smallLoop: // does a loop unrolling to copy word when n < 4
CMPBEQ R5, $0, returnZero
MVC $8, 0(R6), 0(R8)
CMPBEQ R5, $1, returnZero
MVC $8, 8(R6), 8(R8)
CMPBEQ R5, $2, returnZero
MVC $8, 16(R6), 16(R8)
returnZero:
MOVD $0, c+56(FP) // return 0 as carry
RET
mediumLoop:
// Dispatch on the remaining count: <4 words -> smallLoop,
// <32 words -> mediumLoopBody, otherwise fall into largeLoop.
CMPBLT R5, $4, smallLoop
CMPBLT R5, $32, mediumLoopBody
largeLoop: // Copying 256 bytes at a time.
MVC $256, 0(R6), 0(R8)
MOVD $256(R6), R6
MOVD $256(R8), R8
MOVD $-32(R5), R5
CMPBGE R5, $32, largeLoop
BR mediumLoop
mediumLoopBody: // Copying 32 bytes at a time
MVC $32, 0(R6), 0(R8)
MOVD $32(R6), R6
MOVD $32(R8), R8
MOVD $-4(R5), R5
CMPBGE R5, $4, mediumLoopBody
BR smallLoop
returnC:
// Empty input: the incoming y is the result.
MOVD R7, c+56(FP)
RET
// func subVW(z, x []Word, y Word) (c Word)
//
// subVW stores x - y into z over z_len words and returns the final borrow.
// With z_len == 0 it returns y unchanged (returnC).
//
// Flow:
//   1. The first two words are subtracted unconditionally (SUBC, then SUBE).
//   2. loopOverEachWord keeps subtracting 1 from successive words for as
//      long as the previous subtract produced a borrow.
//   3. As soon as a subtract produces no borrow, control moves to copySetup
//      and the remaining words of x are copied to z with MVC; the result
//      borrow is then 0 (returnZero).
//   4. If the words run out while still borrowing, returnResult turns the
//      borrow flag into the return value.
//
// Register use: R5 = words remaining, R6 = x pointer, R8 = z pointer,
// R7 = y, R9 = scratch difference, R12 = byte offset of the current word,
// R0 = zero operand for SUBE.
TEXT ·subVW(SB), NOSPLIT, $0
MOVD z_len+8(FP), R5
MOVD x+24(FP), R6
MOVD y+48(FP), R7 // The borrow bit passed in
MOVD z+0(FP), R8
MOVD $0, R0 // R0 is a temporary variable used during computation. Ensure it has zero in it.
CMPBEQ R5, $0, returnC // len(z) == 0, have an early return
// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
MOVD 0(R6), R9
SUBC R7, R9
MOVD R9, 0(R8)
CMPBEQ R5, $1, returnResult
MOVD 8(R6), R9
SUBE R0, R9
MOVD R9, 8(R8)
CMPBEQ R5, $2, returnResult
// Update the counters
// NOTE(review): the BRC at the top of the loop tests the borrow left by the
// SUBE above, so these two MOVDs presumably do not disturb the condition
// code — confirm against the instruction encodings.
MOVD $16, R12 // i = 2
MOVD $-2(R5), R5 // n = n - 2
loopOverEachWord:
BRC $3, copySetup // no borrow, copy the rest
MOVD 0(R6)(R12*1), R9
// Originally we used the borrow flag generated in the previous iteration
// (i.e: SUBE could be used here to do the subtraction). However, since we
// already know borrow is 1 (otherwise we will go to copy section), we can
// use SUBC here so the current iteration does not depend on the borrow flag
// generated in the previous iteration. This could be useful when branch prediction happens.
SUBC $1, R9
MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1
MOVD $8(R12), R12 // i++
BRCTG R5, loopOverEachWord // n--
// return the current borrow value
returnResult:
// SUBE R0, R0 leaves 0 (no borrow) or -1 (borrow); NEG maps that to 0/1.
SUBE R0, R0
NEG R0, R0
MOVD R0, c+56(FP)
RET
// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
// With the assumption that x and z will not overlap with each other or x and z will
// point to same memory region, we can use a faster version of copy using only MVC here.
// In the following implementation, we have three copy loops, each copying a word, 4 words, and
// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
ADD R12, R6
ADD R12, R8
CMPBGE R5, $4, mediumLoop
smallLoop: // does a loop unrolling to copy word when n < 4
CMPBEQ R5, $0, returnZero
MVC $8, 0(R6), 0(R8)
CMPBEQ R5, $1, returnZero
MVC $8, 8(R6), 8(R8)
CMPBEQ R5, $2, returnZero
MVC $8, 16(R6), 16(R8)
returnZero:
MOVD $0, c+56(FP) // return 0 as borrow
RET
mediumLoop:
// Dispatch on the remaining count: <4 words -> smallLoop,
// <32 words -> mediumLoopBody, otherwise fall into largeLoop.
CMPBLT R5, $4, smallLoop
CMPBLT R5, $32, mediumLoopBody
largeLoop: // Copying 256 bytes at a time
MVC $256, 0(R6), 0(R8)
MOVD $256(R6), R6
MOVD $256(R8), R8
MOVD $-32(R5), R5
CMPBGE R5, $32, largeLoop
BR mediumLoop
mediumLoopBody: // Copying 32 bytes at a time
MVC $32, 0(R6), 0(R8)
MOVD $32(R6), R6
MOVD $32(R8), R8
MOVD $-4(R5), R5
CMPBGE R5, $4, mediumLoopBody
BR smallLoop
returnC:
// Empty input: the incoming y is the result.
MOVD R7, c+56(FP)
RET
// func lshVU(z, x []Word, s uint) (c Word)
//
// lshVU has no assembly body of its own here: the entry point is a single
// branch to lshVU_g, which does all the work.
TEXT ·lshVU(SB), NOSPLIT, $0
BR ·lshVU_g(SB)

View File

@ -28,8 +28,8 @@ var shifts = []uint{1, 2, 3, _W/4 - 1, _W / 4, _W/4 + 1, _W/2 - 1, _W / 2, _W/2
// Each test below hands a pair of implementations of the same primitive
// to a shared driver (testVV, testVW, testVU, testVWW), together with the
// name used in failure messages and, where applicable, a table of extra
// operands. Presumably the driver checks that both implementations agree;
// confirm against the driver definitions.
//
// NOTE(review): TestAddVW and TestSubVW are each declared twice below, once
// against the *_g function and once against the *_ref function. This looks
// like the removed and added sides of a diff hunk shown together; only one
// pair can exist in a compilable file — confirm which one is live.

// TestAddVV runs testVV on addVV and addVV_g.
func TestAddVV(t *testing.T) { testVV(t, "addVV", addVV, addVV_g) }
// TestSubVV runs testVV on subVV and subVV_g.
func TestSubVV(t *testing.T) { testVV(t, "subVV", subVV, subVV_g) }
// TestAddVW runs testVW on addVW and addVW_g with the words4 operands.
func TestAddVW(t *testing.T) { testVW(t, "addVW", addVW, addVW_g, words4) }
// TestSubVW runs testVW on subVW and subVW_g with the words4 operands.
func TestSubVW(t *testing.T) { testVW(t, "subVW", subVW, subVW_g, words4) }
// TestAddVW runs testVW on addVW and addVW_ref with the words4 operands.
func TestAddVW(t *testing.T) { testVW(t, "addVW", addVW, addVW_ref, words4) }
// TestSubVW runs testVW on subVW and subVW_ref with the words4 operands.
func TestSubVW(t *testing.T) { testVW(t, "subVW", subVW, subVW_ref, words4) }
// TestLshVU runs testVU on lshVU and lshVU_g with the shifts table.
func TestLshVU(t *testing.T) { testVU(t, "lshVU", lshVU, lshVU_g, shifts) }
// TestRshVU runs testVU on rshVU and rshVU_g with the shifts table.
func TestRshVU(t *testing.T) { testVU(t, "rshVU", rshVU, rshVU_g, shifts) }
// TestMulAddVWW runs testVWW on mulAddVWW and mulAddVWW_g with the muls table.
func TestMulAddVWW(t *testing.T) { testVWW(t, "mulAddVWW", mulAddVWW, mulAddVWW_g, muls) }
@ -865,21 +865,15 @@ func benchVV(fn func(z, x, y []Word) Word) benchFunc {
}
// BenchmarkAddVW registers addVW benchmarks through the bench helper. Each
// call supplies a name suffix and a benchFunc built by one of three
// constructors, matching the data= label in the suffix:
//
//	data=random    benchVW with word operand 123
//	data=carry     benchCarryVW with ^Word(0) and 1
//	data=shortcut  benchShortVW with word operand 123
//
// NOTE(review): this body contains both impl=asm/impl=go variants (the
// latter calling addVW_g) and unlabelled variants of the same three cases.
// That looks like the removed and added sides of a diff hunk shown
// together — confirm which set is meant to remain.
func BenchmarkAddVW(b *testing.B) {
bench(b, "/impl=asm/data=random", benchVW(addVW, 123))
bench(b, "/impl=asm/data=carry", benchCarryVW(addVW, ^Word(0), 1))
bench(b, "/impl=asm/data=shortcut", benchShortVW(addVW, 123))
bench(b, "/impl=go/data=random", benchVW(addVW_g, 123))
bench(b, "/impl=go/data=carry", benchCarryVW(addVW_g, ^Word(0), 1))
bench(b, "/impl=go/data=shortcut", benchShortVW(addVW_g, 123))
bench(b, "/data=random", benchVW(addVW, 123))
bench(b, "/data=carry", benchCarryVW(addVW, ^Word(0), 1))
bench(b, "/data=shortcut", benchShortVW(addVW, 123))
}
// BenchmarkSubVW registers subVW benchmarks through the bench helper. Each
// call supplies a name suffix and a benchFunc built by one of three
// constructors, matching the data= label in the suffix:
//
//	data=random    benchVW with word operand 123
//	data=carry     benchCarryVW with 0 and 1
//	data=shortcut  benchShortVW with word operand 123
//
// NOTE(review): this body contains both impl=asm/impl=go variants (the
// latter calling subVW_g) and unlabelled variants of the same three cases.
// That looks like the removed and added sides of a diff hunk shown
// together — confirm which set is meant to remain.
func BenchmarkSubVW(b *testing.B) {
bench(b, "/impl=asm/data=random", benchVW(subVW, 123))
bench(b, "/impl=asm/data=carry", benchCarryVW(subVW, 0, 1))
bench(b, "/impl=asm/data=shortcut", benchShortVW(subVW, 123))
bench(b, "/impl=go/data=random", benchVW(subVW_g, 123))
bench(b, "/impl=go/data=carry", benchCarryVW(subVW_g, 0, 1))
bench(b, "/impl=go/data=shortcut", benchShortVW(subVW_g, 123))
bench(b, "/data=random", benchVW(subVW, 123))
bench(b, "/data=carry", benchCarryVW(subVW, 0, 1))
bench(b, "/data=shortcut", benchShortVW(subVW, 123))
}
func benchVW(fn func(z, x []Word, w Word) Word, w Word) benchFunc {

View File

@ -12,12 +12,6 @@ TEXT ·addVV(SB),NOSPLIT,$0
// subVV is a stub that jumps straight to subVV_g.
TEXT ·subVV(SB),NOSPLIT,$0
JMP ·subVV_g(SB)
// addVW is a stub that jumps straight to addVW_g.
TEXT ·addVW(SB),NOSPLIT,$0
JMP ·addVW_g(SB)
// subVW is a stub that jumps straight to subVW_g.
TEXT ·subVW(SB),NOSPLIT,$0
JMP ·subVW_g(SB)
// lshVU is a stub that jumps straight to lshVU_g.
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)