mirror of
https://github.com/golang/go.git
synced 2025-05-31 23:25:39 +00:00
cmd/compile: intrinsify math.{Trunc/Ceil/Floor} on amd64
This significantly speed-ups Trunc. Ceil/Floor are using the same instruction, so do them too. name old time/op new time/op delta Floor-6 3.33ns ± 1% 3.22ns ± 0% -3.39% (p=0.000 n=10+10) Ceil-6 3.33ns ± 1% 3.22ns ± 0% -3.16% (p=0.000 n=10+7) Trunc-6 4.83ns ± 0% 3.22ns ± 0% -33.36% (p=0.000 n=6+8) Change-Id: If848790e458eedfe38a6a0407bb4f589c68ac254 Reviewed-on: https://go-review.googlesource.com/68630 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
8684534321
commit
94484d8ed5
@ -852,6 +852,18 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
|
||||
p.From.Reg = v.Args[0].Reg()
|
||||
p.To.Type = obj.TYPE_REG
|
||||
p.To.Reg = v.Reg()
|
||||
case ssa.OpAMD64ROUNDSD:
|
||||
p := s.Prog(v.Op.Asm())
|
||||
val := v.AuxInt
|
||||
// 1 means math.Floor, 2 Ceil, 3 Trunc
|
||||
if val != 1 && val != 2 && val != 3 {
|
||||
v.Fatalf("Invalid rounding mode")
|
||||
}
|
||||
p.From.Offset = val
|
||||
p.From.Type = obj.TYPE_CONST
|
||||
p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()})
|
||||
p.To.Type = obj.TYPE_REG
|
||||
p.To.Reg = v.Reg()
|
||||
case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL:
|
||||
if v.Args[0].Reg() != v.Reg() {
|
||||
// POPCNT on Intel has a false dependency on the destination register.
|
||||
|
@ -146,6 +146,7 @@ var runtimeDecls = [...]struct {
|
||||
{"msanread", funcTag, 112},
|
||||
{"msanwrite", funcTag, 112},
|
||||
{"support_popcnt", varTag, 11},
|
||||
{"support_sse41", varTag, 11},
|
||||
}
|
||||
|
||||
func runtimeTypes() []*types.Type {
|
||||
|
@ -191,3 +191,4 @@ func msanwrite(addr, size uintptr)
|
||||
|
||||
// architecture variants
|
||||
var support_popcnt bool
|
||||
var support_sse41 bool
|
||||
|
@ -2823,6 +2823,47 @@ func init() {
|
||||
},
|
||||
sys.PPC64)
|
||||
|
||||
makeRoundAMD64 := func(op ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
|
||||
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
|
||||
aux := syslook("support_sse41").Sym.Linksym()
|
||||
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), aux, s.sb)
|
||||
v := s.newValue2(ssa.OpLoad, types.Types[TBOOL], addr, s.mem())
|
||||
b := s.endBlock()
|
||||
b.Kind = ssa.BlockIf
|
||||
b.SetControl(v)
|
||||
bTrue := s.f.NewBlock(ssa.BlockPlain)
|
||||
bFalse := s.f.NewBlock(ssa.BlockPlain)
|
||||
bEnd := s.f.NewBlock(ssa.BlockPlain)
|
||||
b.AddEdgeTo(bTrue)
|
||||
b.AddEdgeTo(bFalse)
|
||||
b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays
|
||||
|
||||
// We have the intrinsic - use it directly.
|
||||
s.startBlock(bTrue)
|
||||
s.vars[n] = s.newValue1(op, types.Types[TFLOAT64], args[0])
|
||||
s.endBlock().AddEdgeTo(bEnd)
|
||||
|
||||
// Call the pure Go version.
|
||||
s.startBlock(bFalse)
|
||||
a := s.call(n, callNormal)
|
||||
s.vars[n] = s.newValue2(ssa.OpLoad, types.Types[TFLOAT64], a, s.mem())
|
||||
s.endBlock().AddEdgeTo(bEnd)
|
||||
|
||||
// Merge results.
|
||||
s.startBlock(bEnd)
|
||||
return s.variable(n, types.Types[TFLOAT64])
|
||||
}
|
||||
}
|
||||
addF("math", "Floor",
|
||||
makeRoundAMD64(ssa.OpFloor),
|
||||
sys.AMD64)
|
||||
addF("math", "Ceil",
|
||||
makeRoundAMD64(ssa.OpCeil),
|
||||
sys.AMD64)
|
||||
addF("math", "Trunc",
|
||||
makeRoundAMD64(ssa.OpTrunc),
|
||||
sys.AMD64)
|
||||
|
||||
/******** math/bits ********/
|
||||
addF("math/bits", "TrailingZeros64",
|
||||
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
|
||||
|
@ -113,6 +113,10 @@
|
||||
|
||||
(Sqrt x) -> (SQRTSD x)
|
||||
|
||||
(Floor x) -> (ROUNDSD [1] x)
|
||||
(Ceil x) -> (ROUNDSD [2] x)
|
||||
(Trunc x) -> (ROUNDSD [3] x)
|
||||
|
||||
// Lowering extension
|
||||
// Note: we always extend to 64 bits even though some ops don't need that many result bits.
|
||||
(SignExt8to16 x) -> (MOVBQSX x)
|
||||
|
@ -341,6 +341,10 @@ func init() {
|
||||
|
||||
{name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
|
||||
|
||||
// ROUNDSD instruction isn't guaranteed to be on the target platform (it is SSE4.1)
|
||||
// Any use must be preceded by a successful check of runtime.support_sse41.
|
||||
{name: "ROUNDSD", argLength: 1, reg: fp11, aux: "Int8", asm: "ROUNDSD"}, // rounds arg0 depending on auxint, 1 means math.Floor, 2 Ceil, 3 Trunc
|
||||
|
||||
{name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
|
||||
{name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
|
||||
// Note: SBBW and SBBB are subsumed by SBBL
|
||||
|
@ -557,6 +557,7 @@ const (
|
||||
OpAMD64POPCNTQ
|
||||
OpAMD64POPCNTL
|
||||
OpAMD64SQRTSD
|
||||
OpAMD64ROUNDSD
|
||||
OpAMD64SBBQcarrymask
|
||||
OpAMD64SBBLcarrymask
|
||||
OpAMD64SETEQ
|
||||
@ -6716,6 +6717,20 @@ var opcodeTable = [...]opInfo{
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "ROUNDSD",
|
||||
auxType: auxInt8,
|
||||
argLen: 1,
|
||||
asm: x86.AROUNDSD,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "SBBQcarrymask",
|
||||
argLen: 1,
|
||||
|
@ -473,6 +473,8 @@ func rewriteValueAMD64(v *Value) bool {
|
||||
return rewriteValueAMD64_OpBswap32_0(v)
|
||||
case OpBswap64:
|
||||
return rewriteValueAMD64_OpBswap64_0(v)
|
||||
case OpCeil:
|
||||
return rewriteValueAMD64_OpCeil_0(v)
|
||||
case OpClosureCall:
|
||||
return rewriteValueAMD64_OpClosureCall_0(v)
|
||||
case OpCom16:
|
||||
@ -563,6 +565,8 @@ func rewriteValueAMD64(v *Value) bool {
|
||||
return rewriteValueAMD64_OpEqB_0(v)
|
||||
case OpEqPtr:
|
||||
return rewriteValueAMD64_OpEqPtr_0(v)
|
||||
case OpFloor:
|
||||
return rewriteValueAMD64_OpFloor_0(v)
|
||||
case OpGeq16:
|
||||
return rewriteValueAMD64_OpGeq16_0(v)
|
||||
case OpGeq16U:
|
||||
@ -893,6 +897,8 @@ func rewriteValueAMD64(v *Value) bool {
|
||||
return rewriteValueAMD64_OpSub8_0(v)
|
||||
case OpSubPtr:
|
||||
return rewriteValueAMD64_OpSubPtr_0(v)
|
||||
case OpTrunc:
|
||||
return rewriteValueAMD64_OpTrunc_0(v)
|
||||
case OpTrunc16to8:
|
||||
return rewriteValueAMD64_OpTrunc16to8_0(v)
|
||||
case OpTrunc32to16:
|
||||
@ -42210,6 +42216,18 @@ func rewriteValueAMD64_OpBswap64_0(v *Value) bool {
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValueAMD64_OpCeil_0(v *Value) bool {
|
||||
// match: (Ceil x)
|
||||
// cond:
|
||||
// result: (ROUNDSD [2] x)
|
||||
for {
|
||||
x := v.Args[0]
|
||||
v.reset(OpAMD64ROUNDSD)
|
||||
v.AuxInt = 2
|
||||
v.AddArg(x)
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValueAMD64_OpClosureCall_0(v *Value) bool {
|
||||
// match: (ClosureCall [argwid] entry closure mem)
|
||||
// cond:
|
||||
@ -42958,6 +42976,18 @@ func rewriteValueAMD64_OpEqPtr_0(v *Value) bool {
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValueAMD64_OpFloor_0(v *Value) bool {
|
||||
// match: (Floor x)
|
||||
// cond:
|
||||
// result: (ROUNDSD [1] x)
|
||||
for {
|
||||
x := v.Args[0]
|
||||
v.reset(OpAMD64ROUNDSD)
|
||||
v.AuxInt = 1
|
||||
v.AddArg(x)
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValueAMD64_OpGeq16_0(v *Value) bool {
|
||||
b := v.Block
|
||||
_ = b
|
||||
@ -46885,6 +46915,18 @@ func rewriteValueAMD64_OpSubPtr_0(v *Value) bool {
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValueAMD64_OpTrunc_0(v *Value) bool {
|
||||
// match: (Trunc x)
|
||||
// cond:
|
||||
// result: (ROUNDSD [3] x)
|
||||
for {
|
||||
x := v.Args[0]
|
||||
v.reset(OpAMD64ROUNDSD)
|
||||
v.AuxInt = 3
|
||||
v.AddArg(x)
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValueAMD64_OpTrunc16to8_0(v *Value) bool {
|
||||
// match: (Trunc16to8 x)
|
||||
// cond:
|
||||
|
@ -8,12 +8,6 @@
|
||||
|
||||
// func Floor(x float64) float64
|
||||
TEXT ·Floor(SB),NOSPLIT,$0
|
||||
CMPB ·useSSE41(SB), $1
|
||||
JNE nosse4
|
||||
ROUNDSD $1, x+0(FP), X0
|
||||
MOVQ X0, ret+8(FP)
|
||||
RET
|
||||
nosse4:
|
||||
MOVQ x+0(FP), AX
|
||||
MOVQ $~(1<<63), DX // sign bit mask
|
||||
ANDQ AX,DX // DX = |x|
|
||||
@ -36,12 +30,6 @@ isBig_floor:
|
||||
|
||||
// func Ceil(x float64) float64
|
||||
TEXT ·Ceil(SB),NOSPLIT,$0
|
||||
CMPB ·useSSE41(SB), $1
|
||||
JNE nosse4
|
||||
ROUNDSD $2, x+0(FP), X0
|
||||
MOVQ X0, ret+8(FP)
|
||||
RET
|
||||
nosse4:
|
||||
MOVQ x+0(FP), AX
|
||||
MOVQ $~(1<<63), DX // sign bit mask
|
||||
MOVQ AX, BX // BX = copy of x
|
||||
|
Loading…
x
Reference in New Issue
Block a user