mirror of
https://github.com/golang/go.git
synced 2025-05-05 15:43:04 +00:00
internal/chacha8rand: implement func block in assembly
Benchmark result on Loongson-3A6000: goos: linux goarch: loong64 pkg: math/rand/v2 + internal/chacha8rand cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | ChaCha8MarshalBinary 67.39n ± 0% 65.96n ± 0% -2.12% (p=0.000 n=10) ChaCha8MarshalBinaryRead 80.93n ± 0% 78.31n ± 0% -3.23% (p=0.000 n=10) ChaCha8 10.610n ± 0% 5.129n ± 0% -51.66% (p=0.000 n=10) ChaCha8Read 51.30n ± 0% 28.05n ± 0% -45.31% (p=0.000 n=10) Block 218.50n ± 0% 45.48n ± 0% -79.19% (p=0.000 n=10) geomean 57.86n 32.05n -44.62% Benchmark result on Loongson-3A5000: goos: linux goarch: loong64 pkg: math/rand/v2 + internal/chacha8rand cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | ChaCha8MarshalBinary 116.3n ± 0% 116.6n ± 0% +0.26% (p=0.015 n=10) ChaCha8MarshalBinaryRead 142.6n ± 0% 142.0n ± 0% -0.39% (p=0.000 n=10) ChaCha8 16.270n ± 0% 6.848n ± 0% -57.91% (p=0.000 n=10) ChaCha8Read 78.65n ± 0% 47.39n ± 1% -39.74% (p=0.000 n=10) Block 301.50n ± 0% 91.85n ± 0% -69.53% (p=0.000 n=10) geomean 91.45n 54.79n -40.09% Change-Id: I64d80c81d2df288fecff80ae23ef89f0fb54cdfa Reviewed-on: https://go-review.googlesource.com/c/go/+/664035 Reviewed-by: Michael Pratt <mpratt@google.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
parent
005d7f29d1
commit
b8ed752d6f
@ -68,7 +68,7 @@ var depsRules = `
|
|||||||
unicode/utf16;
|
unicode/utf16;
|
||||||
|
|
||||||
internal/goarch < internal/abi;
|
internal/goarch < internal/abi;
|
||||||
internal/byteorder, internal/goarch < internal/chacha8rand;
|
internal/byteorder, internal/cpu, internal/goarch < internal/chacha8rand;
|
||||||
|
|
||||||
# RUNTIME is the core runtime group of packages, all of them very light-weight.
|
# RUNTIME is the core runtime group of packages, all of them very light-weight.
|
||||||
internal/abi,
|
internal/abi,
|
||||||
|
@ -7,7 +7,16 @@
|
|||||||
// and must have minimal dependencies.
|
// and must have minimal dependencies.
|
||||||
package chacha8rand
|
package chacha8rand
|
||||||
|
|
||||||
import "internal/byteorder"
|
import (
|
||||||
|
"internal/byteorder"
|
||||||
|
"internal/cpu"
|
||||||
|
"unsafe"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Offsets into internal/cpu records for use in assembly.
|
||||||
|
const (
|
||||||
|
offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX)
|
||||||
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
ctrInc = 4 // increment counter by 4 between block calls
|
ctrInc = 4 // increment counter by 4 between block calls
|
||||||
|
145
src/internal/chacha8rand/chacha8_loong64.s
Normal file
145
src/internal/chacha8rand/chacha8_loong64.s
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
// Copyright 2025 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#include "go_asm.h"
|
||||||
|
#include "textflag.h"
|
||||||
|
|
||||||
|
DATA ·chachaConst+0x00(SB)/4, $0x61707865
|
||||||
|
DATA ·chachaConst+0x04(SB)/4, $0x3320646e
|
||||||
|
DATA ·chachaConst+0x08(SB)/4, $0x79622d32
|
||||||
|
DATA ·chachaConst+0x0c(SB)/4, $0x6b206574
|
||||||
|
GLOBL ·chachaConst(SB), NOPTR|RODATA, $32
|
||||||
|
|
||||||
|
DATA ·chachaIncRot+0x00(SB)/4, $0x00000000
|
||||||
|
DATA ·chachaIncRot+0x04(SB)/4, $0x00000001
|
||||||
|
DATA ·chachaIncRot+0x08(SB)/4, $0x00000002
|
||||||
|
DATA ·chachaIncRot+0x0c(SB)/4, $0x00000003
|
||||||
|
GLOBL ·chachaIncRot(SB), NOPTR|RODATA, $32
|
||||||
|
|
||||||
|
// QR is the ChaCha8 quarter-round on a, b, c, and d.
|
||||||
|
#define QR(a, b, c, d) \
|
||||||
|
VADDW a, b, a; \
|
||||||
|
VXORV d, a, d; \
|
||||||
|
VROTRW $16, d; \
|
||||||
|
VADDW c, d, c; \
|
||||||
|
VXORV b, c, b; \
|
||||||
|
VROTRW $20, b; \
|
||||||
|
VADDW a, b, a; \
|
||||||
|
VXORV d, a, d; \
|
||||||
|
VROTRW $24, d; \
|
||||||
|
VADDW c, d, c; \
|
||||||
|
VXORV b, c, b; \
|
||||||
|
VROTRW $25, b
|
||||||
|
|
||||||
|
|
||||||
|
// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32)
|
||||||
|
TEXT ·block<ABIInternal>(SB), NOSPLIT, $0
|
||||||
|
// seed in R4
|
||||||
|
// blocks in R5
|
||||||
|
// counter in R6
|
||||||
|
|
||||||
|
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
|
||||||
|
BNE R7, lsx_chacha8
|
||||||
|
JMP ·block_generic<ABIInternal>(SB)
|
||||||
|
RET
|
||||||
|
|
||||||
|
lsx_chacha8:
|
||||||
|
MOVV $·chachaConst(SB), R10
|
||||||
|
MOVV $·chachaIncRot(SB), R11
|
||||||
|
|
||||||
|
// load contants
|
||||||
|
// VLDREPL.W $0, R10, V0
|
||||||
|
WORD $0x30200140
|
||||||
|
// VLDREPL.W $1, R10, V1
|
||||||
|
WORD $0x30200541
|
||||||
|
// VLDREPL.W $2, R10, V2
|
||||||
|
WORD $0x30200942
|
||||||
|
// VLDREPL.W $3, R10, V3
|
||||||
|
WORD $0x30200d43
|
||||||
|
|
||||||
|
// load 4-32bit data from incRotMatrix added to counter
|
||||||
|
VMOVQ (R11), V30
|
||||||
|
|
||||||
|
// load seed
|
||||||
|
// VLDREPL.W $0, R4, V4
|
||||||
|
WORD $0x30200084
|
||||||
|
// VLDREPL.W $1, R4, V5
|
||||||
|
WORD $0x30200485
|
||||||
|
// VLDREPL.W $2, R4, V6
|
||||||
|
WORD $0x30200886
|
||||||
|
// VLDREPL.W $3, R4, V7
|
||||||
|
WORD $0x30200c87
|
||||||
|
// VLDREPL.W $4, R4, V8
|
||||||
|
WORD $0x30201088
|
||||||
|
// VLDREPL.W $5, R4, V9
|
||||||
|
WORD $0x30201489
|
||||||
|
// VLDREPL.W $6, R4, V10
|
||||||
|
WORD $0x3020188a
|
||||||
|
// VLDREPL.W $7, R4, V11
|
||||||
|
WORD $0x30201c8b
|
||||||
|
|
||||||
|
// load counter and update counter
|
||||||
|
VMOVQ R6, V12.W4
|
||||||
|
VADDW V12, V30, V12
|
||||||
|
|
||||||
|
// zeros for remaining three matrix entries
|
||||||
|
VXORV V13, V13, V13
|
||||||
|
VXORV V14, V14, V14
|
||||||
|
VXORV V15, V15, V15
|
||||||
|
|
||||||
|
// save seed state for adding back later
|
||||||
|
VORV V4, V13, V20
|
||||||
|
VORV V5, V13, V21
|
||||||
|
VORV V6, V13, V22
|
||||||
|
VORV V7, V13, V23
|
||||||
|
VORV V8, V13, V24
|
||||||
|
VORV V9, V13, V25
|
||||||
|
VORV V10, V13, V26
|
||||||
|
VORV V11, V13, V27
|
||||||
|
|
||||||
|
// 4 iterations. Each iteration is 8 quarter-rounds.
|
||||||
|
MOVV $4, R7
|
||||||
|
loop:
|
||||||
|
QR(V0, V4, V8, V12)
|
||||||
|
QR(V1, V5, V9, V13)
|
||||||
|
QR(V2, V6, V10, V14)
|
||||||
|
QR(V3, V7, V11, V15)
|
||||||
|
|
||||||
|
QR(V0, V5, V10, V15)
|
||||||
|
QR(V1, V6, V11, V12)
|
||||||
|
QR(V2, V7, V8, V13)
|
||||||
|
QR(V3, V4, V9, V14)
|
||||||
|
|
||||||
|
SUBV $1, R7
|
||||||
|
BNE R7, R0, loop
|
||||||
|
|
||||||
|
// add seed back
|
||||||
|
VADDW V4, V20, V4
|
||||||
|
VADDW V5, V21, V5
|
||||||
|
VADDW V6, V22, V6
|
||||||
|
VADDW V7, V23, V7
|
||||||
|
VADDW V8, V24, V8
|
||||||
|
VADDW V9, V25, V9
|
||||||
|
VADDW V10, V26, V10
|
||||||
|
VADDW V11, V27, V11
|
||||||
|
|
||||||
|
// store blocks back to output buffer
|
||||||
|
VMOVQ V0, (R5)
|
||||||
|
VMOVQ V1, 16(R5)
|
||||||
|
VMOVQ V2, 32(R5)
|
||||||
|
VMOVQ V3, 48(R5)
|
||||||
|
VMOVQ V4, 64(R5)
|
||||||
|
VMOVQ V5, 80(R5)
|
||||||
|
VMOVQ V6, 96(R5)
|
||||||
|
VMOVQ V7, 112(R5)
|
||||||
|
VMOVQ V8, 128(R5)
|
||||||
|
VMOVQ V9, 144(R5)
|
||||||
|
VMOVQ V10, 160(R5)
|
||||||
|
VMOVQ V11, 176(R5)
|
||||||
|
VMOVQ V12, 192(R5)
|
||||||
|
VMOVQ V13, 208(R5)
|
||||||
|
VMOVQ V14, 224(R5)
|
||||||
|
VMOVQ V15, 240(R5)
|
||||||
|
|
||||||
|
RET
|
@ -2,7 +2,7 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
//go:build !amd64 && !arm64
|
//go:build !amd64 && !arm64 && !loong64
|
||||||
|
|
||||||
#include "textflag.h"
|
#include "textflag.h"
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user