mirror of
https://github.com/golang/go.git
synced 2025-05-05 15:43:04 +00:00
There is a generic opcode for FMA, but we don't use it in rewrite rules. This is maybe because some archs, like WASM and MIPS don't have a late lowering rule for it. Fixes #71204 Intel Alder Lake 12600k (GOAMD64=v3): math: name old time/op new time/op delta Acos-16 4.58ns ± 0% 3.36ns ± 0% -26.68% (p=0.008 n=5+5) Acosh-16 8.04ns ± 1% 6.44ns ± 0% -19.95% (p=0.008 n=5+5) Asin-16 4.28ns ± 0% 3.32ns ± 0% -22.24% (p=0.008 n=5+5) Asinh-16 9.92ns ± 0% 8.62ns ± 0% -13.13% (p=0.008 n=5+5) Atan-16 2.31ns ± 0% 1.84ns ± 0% -20.02% (p=0.008 n=5+5) Atanh-16 7.79ns ± 0% 7.03ns ± 0% -9.67% (p=0.008 n=5+5) Atan2-16 3.93ns ± 0% 3.52ns ± 0% -10.35% (p=0.000 n=5+4) Cbrt-16 4.62ns ± 0% 4.41ns ± 0% -4.57% (p=0.016 n=4+5) Ceil-16 0.14ns ± 1% 0.14ns ± 2% ~ (p=0.103 n=5+5) Copysign-16 0.33ns ± 0% 0.33ns ± 0% +0.03% (p=0.029 n=4+4) Cos-16 4.87ns ± 0% 4.75ns ± 0% -2.44% (p=0.016 n=5+4) Cosh-16 4.86ns ± 0% 4.86ns ± 0% ~ (p=0.317 n=5+5) Erf-16 2.71ns ± 0% 2.25ns ± 0% -16.69% (p=0.008 n=5+5) Erfc-16 3.06ns ± 0% 2.67ns ± 0% -13.00% (p=0.016 n=5+4) Erfinv-16 3.88ns ± 0% 2.84ns ± 3% -26.83% (p=0.008 n=5+5) Erfcinv-16 4.08ns ± 0% 3.01ns ± 1% -26.27% (p=0.008 n=5+5) Exp-16 3.29ns ± 0% 3.37ns ± 2% +2.64% (p=0.016 n=4+5) ExpGo-16 8.44ns ± 0% 7.48ns ± 1% -11.37% (p=0.008 n=5+5) Expm1-16 4.46ns ± 0% 3.69ns ± 2% -17.26% (p=0.016 n=4+5) Exp2-16 8.20ns ± 0% 7.39ns ± 2% -9.94% (p=0.008 n=5+5) Exp2Go-16 8.26ns ± 0% 7.23ns ± 0% -12.49% (p=0.016 n=4+5) Abs-16 0.26ns ± 3% 0.22ns ± 1% -16.34% (p=0.008 n=5+5) Dim-16 0.38ns ± 1% 0.40ns ± 2% +5.02% (p=0.008 n=5+5) Floor-16 0.11ns ± 1% 0.17ns ± 4% +54.99% (p=0.008 n=5+5) Max-16 1.24ns ± 0% 1.24ns ± 0% ~ (p=0.619 n=5+5) Min-16 1.24ns ± 0% 1.24ns ± 0% ~ (p=0.484 n=5+5) Mod-16 13.4ns ± 1% 12.8ns ± 0% -4.21% (p=0.016 n=5+4) Frexp-16 1.70ns ± 0% 1.71ns ± 0% +0.46% (p=0.008 n=5+5) Gamma-16 3.97ns ± 0% 3.97ns ± 0% ~ (p=0.643 n=5+5) Hypot-16 2.11ns ± 0% 2.11ns ± 0% ~ (p=0.762 n=5+5) HypotGo-16 2.48ns ± 4% 2.26ns ± 0% -8.94% (p=0.008 n=5+5) Ilogb-16 1.67ns ± 0% 1.67ns ± 0% 
-0.07% (p=0.048 n=5+5) J0-16 19.8ns ± 0% 19.3ns ± 0% ~ (p=0.079 n=4+5) J1-16 19.4ns ± 0% 18.9ns ± 0% -2.63% (p=0.000 n=5+4) Jn-16 41.5ns ± 0% 40.6ns ± 0% -2.32% (p=0.016 n=4+5) Ldexp-16 2.26ns ± 0% 2.26ns ± 0% ~ (p=0.683 n=5+5) Lgamma-16 4.40ns ± 0% 4.21ns ± 0% -4.21% (p=0.008 n=5+5) Log-16 4.05ns ± 0% 4.05ns ± 0% ~ (all equal) Logb-16 1.69ns ± 0% 1.69ns ± 0% ~ (p=0.429 n=5+5) Log1p-16 5.00ns ± 0% 3.99ns ± 0% -20.14% (p=0.008 n=5+5) Log10-16 4.22ns ± 0% 4.21ns ± 0% -0.15% (p=0.008 n=5+5) Log2-16 2.27ns ± 0% 2.25ns ± 0% -0.94% (p=0.008 n=5+5) Modf-16 1.44ns ± 0% 1.44ns ± 0% ~ (p=0.492 n=5+5) Nextafter32-16 2.09ns ± 0% 2.09ns ± 0% ~ (p=0.079 n=4+5) Nextafter64-16 2.09ns ± 0% 2.09ns ± 0% ~ (p=0.095 n=4+5) PowInt-16 10.8ns ± 0% 10.8ns ± 0% ~ (all equal) PowFrac-16 25.3ns ± 0% 25.3ns ± 0% -0.09% (p=0.000 n=5+4) Pow10Pos-16 0.52ns ± 1% 0.52ns ± 0% ~ (p=0.810 n=5+5) Pow10Neg-16 0.82ns ± 0% 0.82ns ± 0% ~ (p=0.381 n=5+5) Round-16 0.93ns ± 0% 0.93ns ± 0% ~ (p=0.056 n=5+5) RoundToEven-16 1.64ns ± 0% 1.64ns ± 0% ~ (all equal) Remainder-16 12.4ns ± 2% 12.0ns ± 0% -3.27% (p=0.008 n=5+5) Signbit-16 0.37ns ± 0% 0.37ns ± 0% -0.19% (p=0.008 n=5+5) Sin-16 4.04ns ± 0% 3.92ns ± 0% -3.13% (p=0.000 n=4+5) Sincos-16 5.99ns ± 0% 5.80ns ± 0% -3.03% (p=0.008 n=5+5) Sinh-16 5.22ns ± 0% 5.22ns ± 0% ~ (p=0.651 n=5+4) SqrtIndirect-16 0.41ns ± 0% 0.41ns ± 0% ~ (p=0.333 n=4+5) SqrtLatency-16 2.66ns ± 0% 2.66ns ± 0% ~ (p=0.079 n=4+5) SqrtIndirectLatency-16 2.66ns ± 0% 2.66ns ± 0% ~ (p=1.000 n=5+5) SqrtGoLatency-16 30.1ns ± 0% 28.6ns ± 1% -4.84% (p=0.008 n=5+5) SqrtPrime-16 645ns ± 0% 645ns ± 0% ~ (p=0.095 n=5+4) Tan-16 4.21ns ± 0% 4.09ns ± 0% -2.76% (p=0.029 n=4+4) Tanh-16 5.36ns ± 0% 5.36ns ± 0% ~ (p=0.444 n=5+5) Trunc-16 0.12ns ± 6% 0.11ns ± 1% -6.79% (p=0.008 n=5+5) Y0-16 19.2ns ± 0% 18.7ns ± 0% -2.52% (p=0.000 n=5+4) Y1-16 19.1ns ± 0% 18.4ns ± 0% ~ (p=0.079 n=4+5) Yn-16 40.7ns ± 0% 39.5ns ± 0% -2.82% (p=0.008 n=5+5) Float64bits-16 0.21ns ± 0% 0.21ns ± 0% ~ (p=0.603 n=5+5) Float64frombits-16 
0.21ns ± 0% 0.21ns ± 0% ~ (p=0.984 n=4+5) Float32bits-16 0.21ns ± 0% 0.21ns ± 0% ~ (p=0.778 n=4+5) Float32frombits-16 0.21ns ± 0% 0.20ns ± 0% ~ (p=0.397 n=5+5) FMA-16 0.82ns ± 0% 0.82ns ± 0% +0.02% (p=0.029 n=4+4) [Geo mean] 2.87ns 2.74ns -4.61% math/cmplx: name old time/op new time/op delta Abs-16 2.07ns ± 0% 2.05ns ± 0% -0.70% (p=0.016 n=5+4) Acos-16 36.5ns ± 0% 35.7ns ± 0% -2.33% (p=0.029 n=4+4) Acosh-16 37.0ns ± 0% 36.2ns ± 0% -2.20% (p=0.008 n=5+5) Asin-16 36.5ns ± 0% 35.7ns ± 0% -2.29% (p=0.008 n=5+5) Asinh-16 33.5ns ± 0% 31.6ns ± 0% -5.51% (p=0.008 n=5+5) Atan-16 15.5ns ± 0% 13.9ns ± 0% -10.61% (p=0.008 n=5+5) Atanh-16 15.0ns ± 0% 13.6ns ± 0% -9.73% (p=0.008 n=5+5) Conj-16 0.11ns ± 5% 0.11ns ± 1% ~ (p=0.421 n=5+5) Cos-16 12.3ns ± 0% 12.2ns ± 0% -0.60% (p=0.000 n=4+5) Cosh-16 12.1ns ± 0% 12.0ns ± 0% ~ (p=0.079 n=4+5) Exp-16 10.0ns ± 0% 9.8ns ± 0% -1.77% (p=0.008 n=5+5) Log-16 14.5ns ± 0% 13.7ns ± 0% -5.67% (p=0.008 n=5+5) Log10-16 14.5ns ± 0% 13.7ns ± 0% -5.55% (p=0.000 n=5+4) Phase-16 5.11ns ± 0% 4.25ns ± 0% -16.90% (p=0.008 n=5+5) Polar-16 7.12ns ± 0% 6.35ns ± 0% -10.90% (p=0.008 n=5+5) Pow-16 64.3ns ± 0% 63.7ns ± 0% -0.97% (p=0.008 n=5+5) Rect-16 5.74ns ± 0% 5.58ns ± 0% -2.73% (p=0.016 n=4+5) Sin-16 12.2ns ± 0% 12.2ns ± 0% -0.54% (p=0.000 n=4+5) Sinh-16 12.1ns ± 0% 12.0ns ± 0% -0.58% (p=0.000 n=5+4) Sqrt-16 5.30ns ± 0% 5.18ns ± 0% -2.36% (p=0.008 n=5+5) Tan-16 22.7ns ± 0% 22.6ns ± 0% -0.33% (p=0.008 n=5+5) Tanh-16 21.2ns ± 0% 20.9ns ± 0% -1.32% (p=0.008 n=5+5) [Geo mean] 11.3ns 10.8ns -3.97% Change-Id: Idcc4b357ba68477929c126289e5095b27a827b1b Reviewed-on: https://go-review.googlesource.com/c/go/+/646335 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Keith Randall <khr@golang.org>
263 lines
6.5 KiB
Go
263 lines
6.5 KiB
Go
// asmcheck
|
|
|
|
// Copyright 2018 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package codegen
|
|
|
|
import "math"
|
|
|
|
// sink64 is the package-level array that approx, abs and copysign store
// their float64 results into.
// NOTE(review): presumably this keeps the computed values live so the
// instructions named in the asmcheck directives are actually emitted — confirm.
var sink64 [8]float64
|
|
|
|
func approx(x float64) {
|
|
// amd64/v2:-".*x86HasSSE41" amd64/v3:-".*x86HasSSE41"
|
|
// amd64:"ROUNDSD\t[$]2"
|
|
// s390x:"FIDBR\t[$]6"
|
|
// arm64:"FRINTPD"
|
|
// ppc64x:"FRIP"
|
|
// wasm:"F64Ceil"
|
|
sink64[0] = math.Ceil(x)
|
|
|
|
// amd64/v2:-".*x86HasSSE41" amd64/v3:-".*x86HasSSE41"
|
|
// amd64:"ROUNDSD\t[$]1"
|
|
// s390x:"FIDBR\t[$]7"
|
|
// arm64:"FRINTMD"
|
|
// ppc64x:"FRIM"
|
|
// wasm:"F64Floor"
|
|
sink64[1] = math.Floor(x)
|
|
|
|
// s390x:"FIDBR\t[$]1"
|
|
// arm64:"FRINTAD"
|
|
// ppc64x:"FRIN"
|
|
sink64[2] = math.Round(x)
|
|
|
|
// amd64/v2:-".*x86HasSSE41" amd64/v3:-".*x86HasSSE41"
|
|
// amd64:"ROUNDSD\t[$]3"
|
|
// s390x:"FIDBR\t[$]5"
|
|
// arm64:"FRINTZD"
|
|
// ppc64x:"FRIZ"
|
|
// wasm:"F64Trunc"
|
|
sink64[3] = math.Trunc(x)
|
|
|
|
// amd64/v2:-".*x86HasSSE41" amd64/v3:-".*x86HasSSE41"
|
|
// amd64:"ROUNDSD\t[$]0"
|
|
// s390x:"FIDBR\t[$]4"
|
|
// arm64:"FRINTND"
|
|
// wasm:"F64Nearest"
|
|
sink64[4] = math.RoundToEven(x)
|
|
}
|
|
|
|
// sqrt returns math.Sqrt(x).
//
// The asmcheck directives list, per architecture, the square-root
// instruction expected for the return statement. The softfloat variants
// carry negative patterns instead.
func sqrt(x float64) float64 {
	// amd64:"SQRTSD"
	// 386/sse2:"SQRTSD" 386/softfloat:-"SQRTD"
	// arm64:"FSQRTD"
	// arm/7:"SQRTD"
	// mips/hardfloat:"SQRTD" mips/softfloat:-"SQRTD"
	// mips64/hardfloat:"SQRTD" mips64/softfloat:-"SQRTD"
	// wasm:"F64Sqrt"
	// ppc64x:"FSQRT"
	// riscv64: "FSQRTD"
	return math.Sqrt(x)
}
|
|
|
|
// sqrt32 returns the square root of x, written as a float32 -> float64 ->
// float32 round trip around math.Sqrt.
//
// The asmcheck directives name the single-precision square-root
// instruction expected on each architecture for the return statement.
func sqrt32(x float32) float32 {
	// amd64:"SQRTSS"
	// 386/sse2:"SQRTSS" 386/softfloat:-"SQRTS"
	// arm64:"FSQRTS"
	// arm/7:"SQRTF"
	// mips/hardfloat:"SQRTF" mips/softfloat:-"SQRTF"
	// mips64/hardfloat:"SQRTF" mips64/softfloat:-"SQRTF"
	// wasm:"F32Sqrt"
	// ppc64x:"FSQRTS"
	// riscv64: "FSQRTS"
	return float32(math.Sqrt(float64(x)))
}
|
|
|
|
// Check that it's using integer registers
|
|
func abs(x, y float64) {
|
|
// amd64:"BTRQ\t[$]63"
|
|
// arm64:"FABSD\t"
|
|
// s390x:"LPDFR\t",-"MOVD\t" (no integer load/store)
|
|
// ppc64x:"FABS\t"
|
|
// riscv64:"FABSD\t"
|
|
// wasm:"F64Abs"
|
|
// arm/6:"ABSD\t"
|
|
// mips64/hardfloat:"ABSD\t"
|
|
// mips/hardfloat:"ABSD\t"
|
|
sink64[0] = math.Abs(x)
|
|
|
|
// amd64:"BTRQ\t[$]63","PXOR" (TODO: this should be BTSQ)
|
|
// s390x:"LNDFR\t",-"MOVD\t" (no integer load/store)
|
|
// ppc64x:"FNABS\t"
|
|
sink64[1] = -math.Abs(y)
|
|
}
|
|
|
|
// abs32 returns the absolute value of x, written as a float32 -> float64 ->
// float32 round trip around math.Abs.
//
// Check that it's using integer registers.
// NOTE(review): the only directive here is for s390x; it requires LPDFR and
// forbids LDEBR and LEDBR (annotated as no float64 conversion) — confirm
// the sentence above still describes the intent.
func abs32(x float32) float32 {
	// s390x:"LPDFR",-"LDEBR",-"LEDBR" (no float64 conversion)
	return float32(math.Abs(float64(x)))
}
|
|
|
|
// Check that it's using integer registers
|
|
func copysign(a, b, c float64) {
|
|
// amd64:"BTRQ\t[$]63","ANDQ","ORQ"
|
|
// s390x:"CPSDR",-"MOVD" (no integer load/store)
|
|
// ppc64x:"FCPSGN"
|
|
// riscv64:"FSGNJD"
|
|
// wasm:"F64Copysign"
|
|
sink64[0] = math.Copysign(a, b)
|
|
|
|
// amd64:"BTSQ\t[$]63"
|
|
// s390x:"LNDFR\t",-"MOVD\t" (no integer load/store)
|
|
// ppc64x:"FCPSGN"
|
|
// riscv64:"FSGNJD"
|
|
// arm64:"ORR", -"AND"
|
|
sink64[1] = math.Copysign(c, -1)
|
|
|
|
// Like math.Copysign(c, -1), but with integer operations. Useful
|
|
// for platforms that have a copysign opcode to see if it's detected.
|
|
// s390x:"LNDFR\t",-"MOVD\t" (no integer load/store)
|
|
sink64[2] = math.Float64frombits(math.Float64bits(a) | 1<<63)
|
|
|
|
// amd64:"ANDQ","ORQ"
|
|
// s390x:"CPSDR\t",-"MOVD\t" (no integer load/store)
|
|
// ppc64x:"FCPSGN"
|
|
// riscv64:"FSGNJD"
|
|
sink64[3] = math.Copysign(-1, c)
|
|
}
|
|
|
|
// fma returns math.FMA(x, y, z), that is x*y + z computed with a single
// rounding.
//
// The asmcheck directives name the fused multiply-add instruction expected
// on each architecture. The amd64/v3 pattern is negative and rejects any
// reference to x86HasFMA.
// NOTE(review): presumably that means no runtime feature check is emitted
// at GOAMD64=v3 — confirm.
func fma(x, y, z float64) float64 {
	// amd64/v3:-".*x86HasFMA"
	// amd64:"VFMADD231SD"
	// arm/6:"FMULAD"
	// arm64:"FMADDD"
	// loong64:"FMADDD"
	// s390x:"FMADD"
	// ppc64x:"FMADD"
	// riscv64:"FMADDD"
	return math.FMA(x, y, z)
}
|
|
|
|
// fms returns math.FMA(x, y, -z), that is x*y - z with a single rounding.
// The directive expects riscv64 to select FMSUBD for the negated addend.
func fms(x, y, z float64) float64 {
	// riscv64:"FMSUBD"
	return math.FMA(x, y, -z)
}
|
|
|
|
// fnms returns math.FMA(-x, y, z), that is -(x*y) + z with a single
// rounding. The directive expects riscv64 to select FNMSUBD and forbids
// FNMADDD.
func fnms(x, y, z float64) float64 {
	// riscv64:"FNMSUBD",-"FNMADDD"
	return math.FMA(-x, y, z)
}
|
|
|
|
// fnma returns math.FMA(x, -y, -z), that is -(x*y) - z with a single
// rounding. The directive expects riscv64 to select FNMADDD and forbids
// FNMSUBD.
func fnma(x, y, z float64) float64 {
	// riscv64:"FNMADDD",-"FNMSUBD"
	return math.FMA(x, -y, -z)
}
|
|
|
|
// fromFloat64 adds 1 to f64 as a float, reinterprets the sum's bits as a
// uint64 with math.Float64bits, and adds 1 to that integer.
//
// The asmcheck directives match a move whose source is a floating-point
// register and whose destination is not.
// NOTE(review): the float add before and the integer add after presumably
// pin the value to a float register and then an integer register so the
// transfer shows up as a register move — confirm.
func fromFloat64(f64 float64) uint64 {
	// amd64:"MOVQ\tX.*, [^X].*"
	// arm64:"FMOVD\tF.*, R.*"
	// loong64:"MOVV\tF.*, R.*"
	// ppc64x:"MFVSRD"
	// mips64/hardfloat:"MOVV\tF.*, R.*"
	return math.Float64bits(f64+1) + 1
}
|
|
|
|
// fromFloat32 adds 1 to f32 as a float, reinterprets the sum's bits as a
// uint32 with math.Float32bits, and adds 1 to that integer.
//
// The asmcheck directives match a 32-bit move whose source is a
// floating-point register and whose destination is not.
// NOTE(review): the surrounding float and integer adds presumably force
// the value through both register classes — confirm.
func fromFloat32(f32 float32) uint32 {
	// amd64:"MOVL\tX.*, [^X].*"
	// arm64:"FMOVS\tF.*, R.*"
	// loong64:"MOVW\tF.*, R.*"
	// mips64/hardfloat:"MOVW\tF.*, R.*"
	return math.Float32bits(f32+1) + 1
}
|
|
|
|
// toFloat64 adds 1 to u64 as an integer, reinterprets the sum's bits as a
// float64 with math.Float64frombits, and adds 1 to that float.
//
// The asmcheck directives match a move whose destination is a
// floating-point register and whose source is not.
// NOTE(review): the integer add before and the float add after presumably
// force the value through both register classes — confirm.
func toFloat64(u64 uint64) float64 {
	// amd64:"MOVQ\t[^X].*, X.*"
	// arm64:"FMOVD\tR.*, F.*"
	// loong64:"MOVV\tR.*, F.*"
	// ppc64x:"MTVSRD"
	// mips64/hardfloat:"MOVV\tR.*, F.*"
	return math.Float64frombits(u64+1) + 1
}
|
|
|
|
// toFloat32 adds 1 to u32 as an integer, reinterprets the sum's bits as a
// float32 with math.Float32frombits, and adds 1 to that float.
//
// The asmcheck directives match a 32-bit move whose destination is a
// floating-point register and whose source is not.
// NOTE(review): the surrounding integer and float adds presumably force
// the value through both register classes — confirm.
func toFloat32(u32 uint32) float32 {
	// amd64:"MOVL\t[^X].*, X.*"
	// arm64:"FMOVS\tR.*, F.*"
	// loong64:"MOVW\tR.*, F.*"
	// mips64/hardfloat:"MOVW\tR.*, F.*"
	return math.Float32frombits(u32+1) + 1
}
|
|
|
|
// Test that comparisons with constants converted to float
// are evaluated at compile-time.
|
|
|
|
// constantCheck64 returns the value of a boolean expression built only from
// integer constants converted to float64. Both comparisons are false, so
// the result is false.
//
// The directives require a constant-zero result (a zero move or a register
// XOR on amd64) and forbid both a floating-point compare and a move of
// constant 1.
func constantCheck64() bool {
	// amd64:"(MOVB\t[$]0)|(XORL\t[A-Z][A-Z0-9]+, [A-Z][A-Z0-9]+)",-"FCMP",-"MOVB\t[$]1"
	// s390x:"MOV(B|BZ|D)\t[$]0,",-"FCMPU",-"MOV(B|BZ|D)\t[$]1,"
	return 0.5 == float64(uint32(1)) || 1.5 > float64(uint64(1<<63))
}
|
|
|
|
// constantCheck32 returns the value of a boolean expression built only from
// integer constants converted to float32. Both comparisons are true, so the
// result is true.
//
// The directives require a move of constant 1 and forbid both a
// floating-point compare and a move of constant 0.
func constantCheck32() bool {
	// amd64:"MOV(B|L)\t[$]1",-"FCMP",-"MOV(B|L)\t[$]0"
	// s390x:"MOV(B|BZ|D)\t[$]1,",-"FCMPU",-"MOV(B|BZ|D)\t[$]0,"
	return float32(0.5) <= float32(int64(1)) && float32(1.5) >= float32(int32(-1<<31))
}
|
|
|
|
// Test that integer constants are converted to floating point constants
// at compile-time.
|
|
|
|
// constantConvert32 returns -x when x is greater than the float32 whose bit
// pattern is 0x3f800000 (1.0), and x otherwise.
//
// The directives on the comparison expect the constant to appear as a
// floating-point constant operand (f32.3f800000 on amd64, s390x and ppc64x
// power8/power9; an immediate form on ppc64x power10 and arm64).
func constantConvert32(x float32) float32 {
	// amd64:"MOVSS\t[$]f32.3f800000\\(SB\\)"
	// s390x:"FMOVS\t[$]f32.3f800000\\(SB\\)"
	// ppc64x/power8:"FMOVS\t[$]f32.3f800000\\(SB\\)"
	// ppc64x/power9:"FMOVS\t[$]f32.3f800000\\(SB\\)"
	// ppc64x/power10:"XXSPLTIDP\t[$]1065353216, VS0"
	// arm64:"FMOVS\t[$]\\(1.0\\)"
	if x > math.Float32frombits(0x3f800000) {
		return -x
	}
	return x
}
|
|
|
|
// constantConvertInt32 returns -x (wrapping uint32 negation) when x is
// greater than math.Float32bits(1), i.e. 0x3f800000, and x otherwise.
//
// All directives are negative: no floating-point move may be emitted for
// the comparison on the listed architectures.
func constantConvertInt32(x uint32) uint32 {
	// amd64:-"MOVSS"
	// s390x:-"FMOVS"
	// ppc64x:-"FMOVS"
	// arm64:-"FMOVS"
	if x > math.Float32bits(1) {
		return -x
	}
	return x
}
|
|
|
|
// nanGenerate64 builds three NaN-producing float64 expressions (0/0,
// 0*Inf and Sqrt(-1)) from local values and returns their sum, which is
// NaN.
//
// The directives require that the NaN-producing operations are still
// emitted as instructions, while the 1/0 that yields +Inf is not.
func nanGenerate64() float64 {
	// Test to make sure we don't generate a NaN while constant propagating.
	// See issue 36400.
	zero := 0.0
	// amd64:-"DIVSD"
	inf := 1 / zero // +inf. We can constant propagate this one.
	negone := -1.0

	// amd64:"DIVSD"
	z0 := zero / zero
	// amd64/v1,amd64/v2:"MULSD"
	z1 := zero * inf
	// amd64:"SQRTSD"
	z2 := math.Sqrt(negone)
	// amd64/v3:"VFMADD231SD"
	return z0 + z1 + z2
}
|
|
|
|
// nanGenerate32 is the float32 counterpart of nanGenerate64. It builds two
// NaN-producing expressions (0/0 and 0*Inf) from local values and returns
// their sum, which is NaN.
//
// The directives require that the NaN-producing operations are still
// emitted as instructions, while the 1/0 that yields +Inf is not.
func nanGenerate32() float32 {
	zero := float32(0.0)
	// amd64:-"DIVSS"
	inf := 1 / zero // +inf. We can constant propagate this one.

	// amd64:"DIVSS"
	z0 := zero / zero
	// amd64/v1,amd64/v2:"MULSS"
	z1 := zero * inf
	// amd64/v3:"VFMADD231SS"
	return z0 + z1
}
|