From 8cbb512c76846fd95bffffa235364ca60e3ed0c0 Mon Sep 17 00:00:00 2001 From: Egon Elbre Date: Tue, 18 Feb 2025 14:15:54 +0200 Subject: [PATCH] crypto/internal/fips140/edwards25519/field: inline carryPropagate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Manually inlining carryPropagate seems to help quite a bit. goos: linux goarch: arm64 pkg: crypto/internal/fips140/edwards25519 │ OLD │ NEW │ │ sec/op │ sec/op vs base │ EncodingDecoding-4 141.4µ ± 0% 133.1µ ± 0% -5.90% (p=0.000 n=10) ScalarBaseMult-4 260.5µ ± 0% 254.0µ ± 0% -2.49% (p=0.000 n=10) ScalarMult-4 916.6µ ± 0% 892.5µ ± 0% -2.63% (p=0.000 n=10) VarTimeDoubleScalarBaseMult-4 909.6µ ± 0% 886.6µ ± 0% -2.52% (p=0.000 n=10) geomean 418.6µ 404.4µ -3.40% pkg: crypto/internal/fips140/edwards25519/field │ OLD │ NEW │ │ sec/op │ sec/op vs base │ Add-4 33.50n ± 0% 33.52n ± 0% +0.04% (p=0.013 n=10) Multiply-4 406.8n ± 0% 400.0n ± 0% -1.66% (p=0.000 n=10) Square-4 246.4n ± 0% 234.4n ± 0% -4.85% (p=0.000 n=10) Invert-4 67.37µ ± 0% 63.53µ ± 0% -5.69% (p=0.000 n=10) Mult32-4 78.68n ± 0% 78.67n ± 0% ~ (p=0.367 n=10) Bytes-4 110.6n ± 0% 110.8n ± 0% ~ (p=0.568 n=10) geomean 354.0n 346.8n -2.03% goos: darwin goarch: arm64 pkg: crypto/internal/fips140/edwards25519 cpu: Apple M1 Pro │ OLD │ NEW │ │ sec/op │ sec/op vs base │ EncodingDecoding-10 7.610µ ± 2% 7.459µ ± 0% -1.98% (p=0.000 n=10) ScalarBaseMult-10 11.54µ ± 0% 11.36µ ± 0% -1.53% (p=0.000 n=10) ScalarMult-10 38.59µ ± 2% 37.09µ ± 0% -3.88% (p=0.000 n=10) VarTimeDoubleScalarBaseMult-10 37.10µ ± 0% 35.79µ ± 0% -3.54% (p=0.000 n=10) geomean 18.83µ 18.31µ -2.74% pkg: crypto/internal/fips140/edwards25519/field │ OLD │ NEW │ │ sec/op │ sec/op vs base │ Add-10 5.116n ± 5% 5.168n ± 5% ~ (p=0.669 n=10) Multiply-10 18.00n ± 2% 16.90n ± 1% -6.09% (p=0.000 n=10) Square-10 13.66n ± 0% 13.48n ± 0% -1.28% (p=0.000 n=10) Invert-10 3.629µ ± 0% 3.508µ ± 2% -3.33% (p=0.000 n=10) Mult32-10 6.165n ± 2% 6.324n ± 1% +2.57% (p=0.000 n=10) Bytes-10 10.33n ± 8% 10.28n ± 4% ~ (p=0.516 n=10) geomean 25.74n 25.40n -1.31% tags: purego goos: windows goarch: amd64 pkg: crypto/internal/fips140/edwards25519 cpu: AMD Ryzen Threadripper 2950X 16-Core Processor │ OLD │ NEW │ │ sec/op │ sec/op vs base │ EncodingDecoding-32 9.557µ ± 1% 8.966µ ± 0% -6.18% (p=0.000 n=10) ScalarBaseMult-32 19.14µ ± 2% 19.00µ ± 1% ~ (p=0.190 n=10) ScalarMult-32 64.61µ ± 1% 65.83µ ± 2% +1.88% (p=0.003 n=10) VarTimeDoubleScalarBaseMult-32 62.36µ ± 1% 62.14µ ± 1% ~ (p=0.529 n=10) geomean 29.30µ 28.89µ -1.39% pkg: crypto/internal/fips140/edwards25519/field │ OLD │ NEW │ │ sec/op │ sec/op vs base │ Add-32 4.879n ± 1% 4.880n ± 1% ~ (p=0.739 n=10) Multiply-32 22.75n ± 2% 22.03n ± 3% -3.14% (p=0.000 n=10) Square-32 16.46n ± 2% 15.38n ± 1% -6.59% (p=0.000 n=10) Invert-32 4.466µ ± 1% 4.168µ ± 1% -6.67% (p=0.000 n=10) Mult32-32 5.311n ± 1% 5.492n ± 1% +3.40% (p=0.000 n=10) Bytes-32 11.51n ± 1% 11.53n ± 1% ~ (p=0.303 n=10) geomean 28.16n 27.54n -2.20% Change-Id: I6e60d2f1220661df4b4f2bf2d810864c19c03012 Reviewed-on: https://go-review.googlesource.com/c/go/+/650279 Auto-Submit: Filippo Valsorda LUCI-TryBot-Result: Go LUCI Reviewed-by: Ian Lance Taylor Reviewed-by: Michael Pratt Reviewed-by: Filippo Valsorda --- .../fips140/edwards25519/field/fe_generic.go | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_generic.go b/src/crypto/internal/fips140/edwards25519/field/fe_generic.go index 1d1a3e79a2..ef1f15a5dc 100644 --- a/src/crypto/internal/fips140/edwards25519/field/fe_generic.go +++ b/src/crypto/internal/fips140/edwards25519/field/fe_generic.go @@ -166,16 +166,21 @@ func feMulGeneric(v, a, b *Element) { c3 := shiftRightBy51(r3) c4 := shiftRightBy51(r4) - v.l0 = r0.lo&maskLow51Bits + mul19(c4) - v.l1 = r1.lo&maskLow51Bits + c0 - v.l2 = r2.lo&maskLow51Bits + c1 - v.l3 = r3.lo&maskLow51Bits + c2 - v.l4 = r4.lo&maskLow51Bits + c3 + rr0 := r0.lo&maskLow51Bits + mul19(c4) + rr1 := r1.lo&maskLow51Bits + c0 + rr2 := r2.lo&maskLow51Bits + c1 + rr3 := r3.lo&maskLow51Bits + c2 + rr4 := r4.lo&maskLow51Bits + c3 // Now all coefficients fit into 64-bit registers but are still too large to // be passed around as an Element. We therefore do one last carry chain, // where the carries will be small enough to fit in the wiggle room above 2⁵¹. - v.carryPropagate() + + v.l0 = rr0&maskLow51Bits + mul19(rr4>>51) + v.l1 = rr1&maskLow51Bits + rr0>>51 + v.l2 = rr2&maskLow51Bits + rr1>>51 + v.l3 = rr3&maskLow51Bits + rr2>>51 + v.l4 = rr4&maskLow51Bits + rr3>>51 } func feSquareGeneric(v, a *Element) { @@ -238,13 +243,17 @@ func feSquareGeneric(v, a *Element) { c3 := shiftRightBy51(r3) c4 := shiftRightBy51(r4) - v.l0 = r0.lo&maskLow51Bits + mul19(c4) - v.l1 = r1.lo&maskLow51Bits + c0 - v.l2 = r2.lo&maskLow51Bits + c1 - v.l3 = r3.lo&maskLow51Bits + c2 - v.l4 = r4.lo&maskLow51Bits + c3 + rr0 := r0.lo&maskLow51Bits + mul19(c4) + rr1 := r1.lo&maskLow51Bits + c0 + rr2 := r2.lo&maskLow51Bits + c1 + rr3 := r3.lo&maskLow51Bits + c2 + rr4 := r4.lo&maskLow51Bits + c3 - v.carryPropagate() + v.l0 = rr0&maskLow51Bits + mul19(rr4>>51) + v.l1 = rr1&maskLow51Bits + rr0>>51 + v.l2 = rr2&maskLow51Bits + rr1>>51 + v.l3 = rr3&maskLow51Bits + rr2>>51 + v.l4 = rr4&maskLow51Bits + rr3>>51 } // carryPropagate brings the limbs below 52 bits by applying the reduction