cmd/compile: intrinsify math/bits.Bswap on riscv64

For riscv64/rva22u64 and above, we can intrinsify math/bits.ReverseBytes{16,32,64}
(and internal/runtime/sys.Bswap{32,64}) using the REV8 machine instruction.

On a StarFive VisionFive 2 with GORISCV64=rva22u64:

                 │     rb.1     │                rb.2                 │
                 │    sec/op    │   sec/op     vs base                │
ReverseBytes-4     18.790n ± 0%   4.026n ± 0%  -78.57% (p=0.000 n=10)
ReverseBytes16-4    6.710n ± 0%   5.368n ± 0%  -20.00% (p=0.000 n=10)
ReverseBytes32-4   13.420n ± 0%   5.368n ± 0%  -60.00% (p=0.000 n=10)
ReverseBytes64-4   17.450n ± 0%   4.026n ± 0%  -76.93% (p=0.000 n=10)
geomean             13.11n        4.649n       -64.54%

Change-Id: I26eee34270b1721f7304bb1cddb0fda129b20ece
Reviewed-on: https://go-review.googlesource.com/c/go/+/660855
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
This commit is contained in:
Joel Sing 2025-03-20 01:09:23 +11:00
parent 6fc1e34100
commit 90e8b8cdae
8 changed files with 118 additions and 20 deletions

View File

@ -419,7 +419,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVDX, ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVDX,
ssa.OpRISCV64FCVTSW, ssa.OpRISCV64FCVTSL, ssa.OpRISCV64FCVTWS, ssa.OpRISCV64FCVTLS, ssa.OpRISCV64FCVTSW, ssa.OpRISCV64FCVTSL, ssa.OpRISCV64FCVTWS, ssa.OpRISCV64FCVTLS,
ssa.OpRISCV64FCVTDW, ssa.OpRISCV64FCVTDL, ssa.OpRISCV64FCVTWD, ssa.OpRISCV64FCVTLD, ssa.OpRISCV64FCVTDS, ssa.OpRISCV64FCVTSD, ssa.OpRISCV64FCVTDW, ssa.OpRISCV64FCVTDL, ssa.OpRISCV64FCVTWD, ssa.OpRISCV64FCVTLD, ssa.OpRISCV64FCVTDS, ssa.OpRISCV64FCVTSD,
ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CLZ, ssa.OpRISCV64CLZW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW: ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CLZ, ssa.OpRISCV64CLZW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW,
ssa.OpRISCV64REV8:
p := s.Prog(v.Op.Asm()) p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg() p.From.Reg = v.Args[0].Reg()

View File

@ -231,6 +231,11 @@
(BitLen16 x) => (BitLen64 (ZeroExt16to64 x)) (BitLen16 x) => (BitLen64 (ZeroExt16to64 x))
(BitLen8 x) => (BitLen64 (ZeroExt8to64 x)) (BitLen8 x) => (BitLen64 (ZeroExt8to64 x))
// Byte swap (note that these will only be emitted for rva22u64 and above).
(Bswap64 ...) => (REV8 ...)
(Bswap32 <t> x) => (SRLI [32] (REV8 <t> x))
(Bswap16 <t> x) => (SRLI [48] (REV8 <t> x))
(Less64 ...) => (SLT ...) (Less64 ...) => (SLT ...)
(Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y)) (Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y))
(Less16 x y) => (SLT (SignExt16to64 x) (SignExt16to64 y)) (Less16 x y) => (SLT (SignExt16to64 x) (SignExt16to64 y))

View File

@ -237,6 +237,7 @@ func init() {
{name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1
{name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // ^arg0 | arg1 {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // ^arg0 | arg1
{name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint
{name: "REV8", argLength: 1, reg: gp11, asm: "REV8"}, // reverse bytes
{name: "ROL", argLength: 2, reg: gp21, asm: "ROL"}, // rotate left arg0 by (arg1 & 63) {name: "ROL", argLength: 2, reg: gp21, asm: "ROL"}, // rotate left arg0 by (arg1 & 63)
{name: "ROLW", argLength: 2, reg: gp21, asm: "ROLW"}, // rotate left least significant word of arg0 by (arg1 & 31), sign extended {name: "ROLW", argLength: 2, reg: gp21, asm: "ROLW"}, // rotate left least significant word of arg0 by (arg1 & 31), sign extended
{name: "ROR", argLength: 2, reg: gp21, asm: "ROR"}, // rotate right arg0 by (arg1 & 63) {name: "ROR", argLength: 2, reg: gp21, asm: "ROR"}, // rotate right arg0 by (arg1 & 63)

View File

@ -2520,6 +2520,7 @@ const (
OpRISCV64OR OpRISCV64OR
OpRISCV64ORN OpRISCV64ORN
OpRISCV64ORI OpRISCV64ORI
OpRISCV64REV8
OpRISCV64ROL OpRISCV64ROL
OpRISCV64ROLW OpRISCV64ROLW
OpRISCV64ROR OpRISCV64ROR
@ -33968,6 +33969,19 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "REV8",
argLen: 1,
asm: riscv.AREV8,
reg: regInfo{
inputs: []inputInfo{
{0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
},
outputs: []outputInfo{
{0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
},
},
},
{ {
name: "ROL", name: "ROL",
argLen: 2, argLen: 2,

View File

@ -110,6 +110,13 @@ func rewriteValueRISCV64(v *Value) bool {
return rewriteValueRISCV64_OpBitLen64(v) return rewriteValueRISCV64_OpBitLen64(v)
case OpBitLen8: case OpBitLen8:
return rewriteValueRISCV64_OpBitLen8(v) return rewriteValueRISCV64_OpBitLen8(v)
case OpBswap16:
return rewriteValueRISCV64_OpBswap16(v)
case OpBswap32:
return rewriteValueRISCV64_OpBswap32(v)
case OpBswap64:
v.Op = OpRISCV64REV8
return true
case OpClosureCall: case OpClosureCall:
v.Op = OpRISCV64CALLclosure v.Op = OpRISCV64CALLclosure
return true return true
@ -1002,6 +1009,38 @@ func rewriteValueRISCV64_OpBitLen8(v *Value) bool {
return true return true
} }
} }
// rewriteValueRISCV64_OpBswap16 lowers a generic Bswap16 value to RISCV64
// ops. The rewrite is unconditional, so it always reports true.
func rewriteValueRISCV64_OpBswap16(v *Value) bool {
	// Capture the operand and result type up front, before v is rewritten
	// in place.
	operand := v.Args[0]
	blk := v.Block
	typ := v.Type
	// match: (Bswap16 <t> x)
	// result: (SRLI [48] (REV8 <t> x))
	v.reset(OpRISCV64SRLI)
	v.AuxInt = int64ToAuxInt(48)
	// REV8 reverses the bytes of its operand; the SRLI by 48 presumably moves
	// the swapped 16-bit value from the top of the 64-bit register back to
	// the low bits (NOTE(review): confirm against the ISA manual).
	swapped := blk.NewValue0(v.Pos, OpRISCV64REV8, typ)
	swapped.AddArg(operand)
	v.AddArg(swapped)
	return true
}
// rewriteValueRISCV64_OpBswap32 lowers a generic Bswap32 value to RISCV64
// ops. The rewrite is unconditional, so it always reports true.
func rewriteValueRISCV64_OpBswap32(v *Value) bool {
	// Capture the operand and result type up front, before v is rewritten
	// in place.
	operand := v.Args[0]
	blk := v.Block
	typ := v.Type
	// match: (Bswap32 <t> x)
	// result: (SRLI [32] (REV8 <t> x))
	v.reset(OpRISCV64SRLI)
	v.AuxInt = int64ToAuxInt(32)
	// REV8 reverses the bytes of its operand; the SRLI by 32 presumably moves
	// the swapped 32-bit value from the upper half of the 64-bit register
	// back to the low bits (NOTE(review): confirm against the ISA manual).
	swapped := blk.NewValue0(v.Pos, OpRISCV64REV8, typ)
	swapped.AddArg(operand)
	v.AddArg(swapped)
	return true
}
func rewriteValueRISCV64_OpConst16(v *Value) bool { func rewriteValueRISCV64_OpConst16(v *Value) bool {
// match: (Const16 [val]) // match: (Const16 [val])
// result: (MOVDconst [int64(val)]) // result: (MOVDconst [int64(val)])

View File

@ -184,22 +184,44 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
}, },
all...) all...)
brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X}
if cfg.goppc64 >= 10 {
// Use only on Power10 as the new byte reverse instructions that Power10 provide
// make it worthwhile as an intrinsic
brev_arch = append(brev_arch, sys.PPC64)
}
addF("internal/runtime/sys", "Bswap32", addF("internal/runtime/sys", "Bswap32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
}, },
brev_arch...) sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
addF("internal/runtime/sys", "Bswap64", addF("internal/runtime/sys", "Bswap64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
}, },
brev_arch...) sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
if cfg.goppc64 >= 10 {
// Use only on Power10 as the new byte reverse instructions that Power10 provide
// make it worthwhile as an intrinsic
addF("internal/runtime/sys", "Bswap32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
},
sys.PPC64)
addF("internal/runtime/sys", "Bswap64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
},
sys.PPC64)
}
if cfg.goriscv64 >= 22 {
addF("internal/runtime/sys", "Bswap32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
},
sys.RISCV64)
addF("internal/runtime/sys", "Bswap64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
},
sys.RISCV64)
}
/****** Prefetch ******/ /****** Prefetch ******/
makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
@ -924,23 +946,30 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
sys.RISCV64) sys.RISCV64)
} }
// ReverseBytes inlines correctly, no need to intrinsify it.
alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...) alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...)
alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...) alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...)
// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate
addF("math/bits", "ReverseBytes16", addF("math/bits", "ReverseBytes16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0]) return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
}, },
sys.Loong64) sys.Loong64)
// ReverseBytes inlines correctly, no need to intrinsify it.
// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate
// On Power10, 16-bit rotate is not available so use BRH instruction
if cfg.goppc64 >= 10 { if cfg.goppc64 >= 10 {
// On Power10, 16-bit rotate is not available so use BRH instruction
addF("math/bits", "ReverseBytes16", addF("math/bits", "ReverseBytes16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0]) return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
}, },
sys.PPC64) sys.PPC64)
} }
if cfg.goriscv64 >= 22 {
addF("math/bits", "ReverseBytes16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
},
sys.RISCV64)
}
addF("math/bits", "Len64", addF("math/bits", "Len64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {

View File

@ -1107,6 +1107,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"riscv64", "internal/runtime/math", "Add64"}: struct{}{}, {"riscv64", "internal/runtime/math", "Add64"}: struct{}{},
{"riscv64", "internal/runtime/math", "Mul64"}: struct{}{}, {"riscv64", "internal/runtime/math", "Mul64"}: struct{}{},
{"riscv64", "internal/runtime/math", "MulUintptr"}: struct{}{}, {"riscv64", "internal/runtime/math", "MulUintptr"}: struct{}{},
{"riscv64", "internal/runtime/sys", "Bswap32"}: struct{}{},
{"riscv64", "internal/runtime/sys", "Bswap64"}: struct{}{},
{"riscv64", "internal/runtime/sys", "GetCallerPC"}: struct{}{}, {"riscv64", "internal/runtime/sys", "GetCallerPC"}: struct{}{},
{"riscv64", "internal/runtime/sys", "GetCallerSP"}: struct{}{}, {"riscv64", "internal/runtime/sys", "GetCallerSP"}: struct{}{},
{"riscv64", "internal/runtime/sys", "GetClosurePtr"}: struct{}{}, {"riscv64", "internal/runtime/sys", "GetClosurePtr"}: struct{}{},
@ -1129,6 +1131,9 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"riscv64", "math/bits", "Len8"}: struct{}{}, {"riscv64", "math/bits", "Len8"}: struct{}{},
{"riscv64", "math/bits", "Mul"}: struct{}{}, {"riscv64", "math/bits", "Mul"}: struct{}{},
{"riscv64", "math/bits", "Mul64"}: struct{}{}, {"riscv64", "math/bits", "Mul64"}: struct{}{},
{"riscv64", "math/bits", "ReverseBytes16"}: struct{}{},
{"riscv64", "math/bits", "ReverseBytes32"}: struct{}{},
{"riscv64", "math/bits", "ReverseBytes64"}: struct{}{},
{"riscv64", "math/bits", "RotateLeft"}: struct{}{}, {"riscv64", "math/bits", "RotateLeft"}: struct{}{},
{"riscv64", "math/bits", "RotateLeft16"}: struct{}{}, {"riscv64", "math/bits", "RotateLeft16"}: struct{}{},
{"riscv64", "math/bits", "RotateLeft32"}: struct{}{}, {"riscv64", "math/bits", "RotateLeft32"}: struct{}{},

View File

@ -261,42 +261,46 @@ func Reverse8(n uint8) uint8 {
// ----------------------- // // ----------------------- //
func ReverseBytes(n uint) uint { func ReverseBytes(n uint) uint {
// amd64:"BSWAPQ"
// 386:"BSWAPL" // 386:"BSWAPL"
// s390x:"MOVDBR" // amd64:"BSWAPQ"
// arm64:"REV" // arm64:"REV"
// loong64:"REVBV" // loong64:"REVBV"
// riscv64/rva22u64,riscv64/rva23u64:"REV8"
// s390x:"MOVDBR"
return bits.ReverseBytes(n) return bits.ReverseBytes(n)
} }
func ReverseBytes64(n uint64) uint64 { func ReverseBytes64(n uint64) uint64 {
// amd64:"BSWAPQ"
// 386:"BSWAPL" // 386:"BSWAPL"
// s390x:"MOVDBR" // amd64:"BSWAPQ"
// arm64:"REV" // arm64:"REV"
// ppc64x/power10: "BRD"
// loong64:"REVBV" // loong64:"REVBV"
// ppc64x/power10: "BRD"
// riscv64/rva22u64,riscv64/rva23u64:"REV8"
// s390x:"MOVDBR"
return bits.ReverseBytes64(n) return bits.ReverseBytes64(n)
} }
func ReverseBytes32(n uint32) uint32 { func ReverseBytes32(n uint32) uint32 {
// amd64:"BSWAPL"
// 386:"BSWAPL" // 386:"BSWAPL"
// s390x:"MOVWBR" // amd64:"BSWAPL"
// arm64:"REVW" // arm64:"REVW"
// loong64:"REVB2W" // loong64:"REVB2W"
// ppc64x/power10: "BRW" // ppc64x/power10: "BRW"
// riscv64/rva22u64,riscv64/rva23u64:"REV8","SRLI\t\\$32"
// s390x:"MOVWBR"
return bits.ReverseBytes32(n) return bits.ReverseBytes32(n)
} }
func ReverseBytes16(n uint16) uint16 { func ReverseBytes16(n uint16) uint16 {
// amd64:"ROLW" // amd64:"ROLW"
// arm64:"REV16W",-"UBFX",-"ORR"
// arm/5:"SLL","SRL","ORR" // arm/5:"SLL","SRL","ORR"
// arm/6:"REV16" // arm/6:"REV16"
// arm/7:"REV16" // arm/7:"REV16"
// arm64:"REV16W",-"UBFX",-"ORR"
// loong64:"REVB2H" // loong64:"REVB2H"
// ppc64x/power10: "BRH" // ppc64x/power10: "BRH"
// riscv64/rva22u64,riscv64/rva23u64:"REV8","SRLI\t\\$48"
return bits.ReverseBytes16(n) return bits.ReverseBytes16(n)
} }