crypto/sha256: improve performance of riscv64 assembly

Simplified the implementation of Ch and Maj by reducing instructions, based on CL 605495 which made the same change for SHA-512.

goos: linux
goarch: riscv64
pkg: crypto/sha256
cpu: Spacemit(R) X60
                    │  oldsha256  │              newsha256              │
                    │   sec/op    │   sec/op     vs base                │
Hash8Bytes/New-8      2.303µ ± 0%   2.098µ ± 0%   -8.90% (p=0.000 n=10)
Hash8Bytes/Sum224-8   2.535µ ± 0%   2.329µ ± 0%   -8.13% (p=0.000 n=10)
Hash8Bytes/Sum256-8   2.558µ ± 0%   2.352µ ± 0%   -8.04% (p=0.000 n=10)
Hash1K/New-8          28.67µ ± 0%   25.21µ ± 0%  -12.06% (p=0.000 n=10)
Hash1K/Sum224-8       28.89µ ± 0%   25.43µ ± 0%  -11.99% (p=0.000 n=10)
Hash1K/Sum256-8       28.91µ ± 0%   25.43µ ± 0%  -12.04% (p=0.000 n=10)
Hash8K/New-8          218.0µ ± 1%   192.7µ ± 2%  -11.58% (p=0.000 n=10)
Hash8K/Sum224-8       218.0µ ± 1%   193.6µ ± 1%  -11.20% (p=0.000 n=10)
Hash8K/Sum256-8       219.1µ ± 1%   193.4µ ± 1%  -11.74% (p=0.000 n=10)
geomean               24.93µ        22.28µ       -10.65%

                    │  oldsha256   │              newsha256               │
                    │     B/s      │     B/s       vs base                │
Hash8Bytes/New-8      3.309Mi ± 0%   3.633Mi ± 0%   +9.80% (p=0.000 n=10)
Hash8Bytes/Sum224-8   3.009Mi ± 0%   3.271Mi ± 0%   +8.72% (p=0.000 n=10)
Hash8Bytes/Sum256-8   2.985Mi ± 0%   3.242Mi ± 0%   +8.63% (p=0.000 n=10)
Hash1K/New-8          34.06Mi ± 0%   38.73Mi ± 0%  +13.72% (p=0.000 n=10)
Hash1K/Sum224-8       33.80Mi ± 0%   38.40Mi ± 0%  +13.63% (p=0.000 n=10)
Hash1K/Sum256-8       33.78Mi ± 0%   38.40Mi ± 0%  +13.69% (p=0.000 n=10)
Hash8K/New-8          35.84Mi ± 1%   40.54Mi ± 2%  +13.10% (p=0.000 n=10)
Hash8K/Sum224-8       35.83Mi ± 1%   40.35Mi ± 1%  +12.61% (p=0.000 n=10)
Hash8K/Sum256-8       35.66Mi ± 1%   40.40Mi ± 1%  +13.29% (p=0.000 n=10)
geomean               15.54Mi        17.39Mi       +11.89%

Change-Id: I9aa692fcfd70634dc6c308db9b5d06bd82ac2302
Reviewed-on: https://go-review.googlesource.com/c/go/+/639495
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
This commit is contained in:
Julian Zhu 2024-12-31 13:35:56 +08:00 committed by Meng Zhuo
parent 42d3cdc909
commit 06f96a598e

View File

@ -88,47 +88,46 @@
// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
// BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
// Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
// = ((y XOR z) AND x) XOR z
#define SHA256T1(index, e, f, g, h) \
MOVWU (index*4)(X18), X8; \
ADD X5, h; \
RORW $6, e, X6; \
ADD X8, h; \
RORW $11, e, X7; \
XOR X7, X6; \
RORW $25, e, X8; \
XOR X7, X6; \
XOR f, g, X5; \
XOR X8, X6; \
AND e, X5; \
ADD X6, h; \
AND e, f, X5; \
NOT e, X7; \
AND g, X7; \
XOR X7, X5; \
XOR g, X5; \
ADD h, X5
// Calculate T2 in X6.
// T2 = BIGSIGMA0(a) + Maj(a, b, c)
// BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
// = ((y XOR z) AND x) XOR (y AND z)
#define SHA256T2(a, b, c) \
RORW $2, a, X6; \
RORW $13, a, X7; \
XOR X7, X6; \
RORW $22, a, X8; \
XOR X7, X6; \
XOR b, c, X9; \
AND b, c, X7; \
AND a, X9; \
XOR X8, X6; \
AND a, b, X7; \
AND a, c, X8; \
XOR X8, X7; \
AND b, c, X9; \
XOR X9, X7; \
ADD X7, X6
XOR X7, X9; \
ADD X9, X6
// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA256ROUND(index, a, b, c, d, e, f, g, h) \
SHA256T1(index, e, f, g, h); \
SHA256T2(a, b, c); \
MOV X6, h; \
ADD X5, d; \
ADD X5, h
ADD X6, X5, h
#define SHA256ROUND0(index, a, b, c, d, e, f, g, h) \
MSGSCHEDULE0(index); \