internal/chacha8rand: implement func block in assembly

Benchmark result on Loongson-3A6000:
goos: linux
goarch: loong64
pkg: math/rand/v2 + internal/chacha8rand
cpu: Loongson-3A6000-HV @ 2500.00MHz
                         |  bench.old   |              bench.new              |
                         |    sec/op    |   sec/op     vs base                |
ChaCha8MarshalBinary        67.39n ± 0%   65.96n ± 0%   -2.12% (p=0.000 n=10)
ChaCha8MarshalBinaryRead    80.93n ± 0%   78.31n ± 0%   -3.23% (p=0.000 n=10)
ChaCha8                    10.610n ± 0%   5.129n ± 0%  -51.66% (p=0.000 n=10)
ChaCha8Read                 51.30n ± 0%   28.05n ± 0%  -45.31% (p=0.000 n=10)
Block                      218.50n ± 0%   45.48n ± 0%  -79.19% (p=0.000 n=10)
geomean                     57.86n        32.05n       -44.62%

Benchmark result on Loongson-3A5000:
goos: linux
goarch: loong64
pkg: math/rand/v2 + internal/chacha8rand
cpu: Loongson-3A5000 @ 2500.00MHz
                         |  bench.old   |              bench.new              |
                         |    sec/op    |   sec/op     vs base                |
ChaCha8MarshalBinary        116.3n ± 0%   116.6n ± 0%   +0.26% (p=0.015 n=10)
ChaCha8MarshalBinaryRead    142.6n ± 0%   142.0n ± 0%   -0.39% (p=0.000 n=10)
ChaCha8                    16.270n ± 0%   6.848n ± 0%  -57.91% (p=0.000 n=10)
ChaCha8Read                 78.65n ± 0%   47.39n ± 1%  -39.74% (p=0.000 n=10)
Block                      301.50n ± 0%   91.85n ± 0%  -69.53% (p=0.000 n=10)
geomean                     91.45n        54.79n       -40.09%

Change-Id: I64d80c81d2df288fecff80ae23ef89f0fb54cdfa
Reviewed-on: https://go-review.googlesource.com/c/go/+/664035
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Xiaolin Zhao 2025-04-09 10:10:16 +08:00 committed by abner chenc
parent 005d7f29d1
commit b8ed752d6f
4 changed files with 157 additions and 3 deletions

View File

@ -68,7 +68,7 @@ var depsRules = `
unicode/utf16; unicode/utf16;
internal/goarch < internal/abi; internal/goarch < internal/abi;
internal/byteorder, internal/goarch < internal/chacha8rand; internal/byteorder, internal/cpu, internal/goarch < internal/chacha8rand;
# RUNTIME is the core runtime group of packages, all of them very light-weight. # RUNTIME is the core runtime group of packages, all of them very light-weight.
internal/abi, internal/abi,

View File

@ -7,7 +7,16 @@
// and must have minimal dependencies. // and must have minimal dependencies.
package chacha8rand package chacha8rand
import "internal/byteorder" import (
"internal/byteorder"
"internal/cpu"
"unsafe"
)
// Offsets into internal/cpu records for use in assembly.
const (
offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX)
)
const ( const (
ctrInc = 4 // increment counter by 4 between block calls ctrInc = 4 // increment counter by 4 between block calls

View File

@ -0,0 +1,145 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
DATA ·chachaConst+0x00(SB)/4, $0x61707865
DATA ·chachaConst+0x04(SB)/4, $0x3320646e
DATA ·chachaConst+0x08(SB)/4, $0x79622d32
DATA ·chachaConst+0x0c(SB)/4, $0x6b206574
GLOBL ·chachaConst(SB), NOPTR|RODATA, $32
DATA ·chachaIncRot+0x00(SB)/4, $0x00000000
DATA ·chachaIncRot+0x04(SB)/4, $0x00000001
DATA ·chachaIncRot+0x08(SB)/4, $0x00000002
DATA ·chachaIncRot+0x0c(SB)/4, $0x00000003
GLOBL ·chachaIncRot(SB), NOPTR|RODATA, $32
// QR is the ChaCha8 quarter-round on a, b, c, and d.
#define QR(a, b, c, d) \
VADDW a, b, a; \
VXORV d, a, d; \
VROTRW $16, d; \
VADDW c, d, c; \
VXORV b, c, b; \
VROTRW $20, b; \
VADDW a, b, a; \
VXORV d, a, d; \
VROTRW $24, d; \
VADDW c, d, c; \
VXORV b, c, b; \
VROTRW $25, b
// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32)
TEXT ·block<ABIInternal>(SB), NOSPLIT, $0
// seed in R4
// blocks in R5
// counter in R6
MOVBU internalcpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
BNE R7, lsx_chacha8
JMP ·block_generic<ABIInternal>(SB)
RET
lsx_chacha8:
MOVV $·chachaConst(SB), R10
MOVV $·chachaIncRot(SB), R11
// load contants
// VLDREPL.W $0, R10, V0
WORD $0x30200140
// VLDREPL.W $1, R10, V1
WORD $0x30200541
// VLDREPL.W $2, R10, V2
WORD $0x30200942
// VLDREPL.W $3, R10, V3
WORD $0x30200d43
// load 4-32bit data from incRotMatrix added to counter
VMOVQ (R11), V30
// load seed
// VLDREPL.W $0, R4, V4
WORD $0x30200084
// VLDREPL.W $1, R4, V5
WORD $0x30200485
// VLDREPL.W $2, R4, V6
WORD $0x30200886
// VLDREPL.W $3, R4, V7
WORD $0x30200c87
// VLDREPL.W $4, R4, V8
WORD $0x30201088
// VLDREPL.W $5, R4, V9
WORD $0x30201489
// VLDREPL.W $6, R4, V10
WORD $0x3020188a
// VLDREPL.W $7, R4, V11
WORD $0x30201c8b
// load counter and update counter
VMOVQ R6, V12.W4
VADDW V12, V30, V12
// zeros for remaining three matrix entries
VXORV V13, V13, V13
VXORV V14, V14, V14
VXORV V15, V15, V15
// save seed state for adding back later
VORV V4, V13, V20
VORV V5, V13, V21
VORV V6, V13, V22
VORV V7, V13, V23
VORV V8, V13, V24
VORV V9, V13, V25
VORV V10, V13, V26
VORV V11, V13, V27
// 4 iterations. Each iteration is 8 quarter-rounds.
MOVV $4, R7
loop:
QR(V0, V4, V8, V12)
QR(V1, V5, V9, V13)
QR(V2, V6, V10, V14)
QR(V3, V7, V11, V15)
QR(V0, V5, V10, V15)
QR(V1, V6, V11, V12)
QR(V2, V7, V8, V13)
QR(V3, V4, V9, V14)
SUBV $1, R7
BNE R7, R0, loop
// add seed back
VADDW V4, V20, V4
VADDW V5, V21, V5
VADDW V6, V22, V6
VADDW V7, V23, V7
VADDW V8, V24, V8
VADDW V9, V25, V9
VADDW V10, V26, V10
VADDW V11, V27, V11
// store blocks back to output buffer
VMOVQ V0, (R5)
VMOVQ V1, 16(R5)
VMOVQ V2, 32(R5)
VMOVQ V3, 48(R5)
VMOVQ V4, 64(R5)
VMOVQ V5, 80(R5)
VMOVQ V6, 96(R5)
VMOVQ V7, 112(R5)
VMOVQ V8, 128(R5)
VMOVQ V9, 144(R5)
VMOVQ V10, 160(R5)
VMOVQ V11, 176(R5)
VMOVQ V12, 192(R5)
VMOVQ V13, 208(R5)
VMOVQ V14, 224(R5)
VMOVQ V15, 240(R5)
RET

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
//go:build !amd64 && !arm64 //go:build !amd64 && !arm64 && !loong64
#include "textflag.h" #include "textflag.h"