go/test/codegen/math.go
Jakub Ciolek 43b7e67040 cmd/compile: lower x*z + y to FMA if FMA enabled
There is a generic opcode for FMA, but we don't use it in rewrite rules.
This is maybe because some archs, like WASM and MIPS don't have a late
lowering rule for it.

Fixes #71204

Intel Alder Lake 12600k (GOAMD64=v3):

math:

name                    old time/op  new time/op  delta
Acos-16                 4.58ns ± 0%  3.36ns ± 0%  -26.68%  (p=0.008 n=5+5)
Acosh-16                8.04ns ± 1%  6.44ns ± 0%  -19.95%  (p=0.008 n=5+5)
Asin-16                 4.28ns ± 0%  3.32ns ± 0%  -22.24%  (p=0.008 n=5+5)
Asinh-16                9.92ns ± 0%  8.62ns ± 0%  -13.13%  (p=0.008 n=5+5)
Atan-16                 2.31ns ± 0%  1.84ns ± 0%  -20.02%  (p=0.008 n=5+5)
Atanh-16                7.79ns ± 0%  7.03ns ± 0%   -9.67%  (p=0.008 n=5+5)
Atan2-16                3.93ns ± 0%  3.52ns ± 0%  -10.35%  (p=0.000 n=5+4)
Cbrt-16                 4.62ns ± 0%  4.41ns ± 0%   -4.57%  (p=0.016 n=4+5)
Ceil-16                 0.14ns ± 1%  0.14ns ± 2%     ~     (p=0.103 n=5+5)
Copysign-16             0.33ns ± 0%  0.33ns ± 0%   +0.03%  (p=0.029 n=4+4)
Cos-16                  4.87ns ± 0%  4.75ns ± 0%   -2.44%  (p=0.016 n=5+4)
Cosh-16                 4.86ns ± 0%  4.86ns ± 0%     ~     (p=0.317 n=5+5)
Erf-16                  2.71ns ± 0%  2.25ns ± 0%  -16.69%  (p=0.008 n=5+5)
Erfc-16                 3.06ns ± 0%  2.67ns ± 0%  -13.00%  (p=0.016 n=5+4)
Erfinv-16               3.88ns ± 0%  2.84ns ± 3%  -26.83%  (p=0.008 n=5+5)
Erfcinv-16              4.08ns ± 0%  3.01ns ± 1%  -26.27%  (p=0.008 n=5+5)
Exp-16                  3.29ns ± 0%  3.37ns ± 2%   +2.64%  (p=0.016 n=4+5)
ExpGo-16                8.44ns ± 0%  7.48ns ± 1%  -11.37%  (p=0.008 n=5+5)
Expm1-16                4.46ns ± 0%  3.69ns ± 2%  -17.26%  (p=0.016 n=4+5)
Exp2-16                 8.20ns ± 0%  7.39ns ± 2%   -9.94%  (p=0.008 n=5+5)
Exp2Go-16               8.26ns ± 0%  7.23ns ± 0%  -12.49%  (p=0.016 n=4+5)
Abs-16                  0.26ns ± 3%  0.22ns ± 1%  -16.34%  (p=0.008 n=5+5)
Dim-16                  0.38ns ± 1%  0.40ns ± 2%   +5.02%  (p=0.008 n=5+5)
Floor-16                0.11ns ± 1%  0.17ns ± 4%  +54.99%  (p=0.008 n=5+5)
Max-16                  1.24ns ± 0%  1.24ns ± 0%     ~     (p=0.619 n=5+5)
Min-16                  1.24ns ± 0%  1.24ns ± 0%     ~     (p=0.484 n=5+5)
Mod-16                  13.4ns ± 1%  12.8ns ± 0%   -4.21%  (p=0.016 n=5+4)
Frexp-16                1.70ns ± 0%  1.71ns ± 0%   +0.46%  (p=0.008 n=5+5)
Gamma-16                3.97ns ± 0%  3.97ns ± 0%     ~     (p=0.643 n=5+5)
Hypot-16                2.11ns ± 0%  2.11ns ± 0%     ~     (p=0.762 n=5+5)
HypotGo-16              2.48ns ± 4%  2.26ns ± 0%   -8.94%  (p=0.008 n=5+5)
Ilogb-16                1.67ns ± 0%  1.67ns ± 0%   -0.07%  (p=0.048 n=5+5)
J0-16                   19.8ns ± 0%  19.3ns ± 0%     ~     (p=0.079 n=4+5)
J1-16                   19.4ns ± 0%  18.9ns ± 0%   -2.63%  (p=0.000 n=5+4)
Jn-16                   41.5ns ± 0%  40.6ns ± 0%   -2.32%  (p=0.016 n=4+5)
Ldexp-16                2.26ns ± 0%  2.26ns ± 0%     ~     (p=0.683 n=5+5)
Lgamma-16               4.40ns ± 0%  4.21ns ± 0%   -4.21%  (p=0.008 n=5+5)
Log-16                  4.05ns ± 0%  4.05ns ± 0%     ~     (all equal)
Logb-16                 1.69ns ± 0%  1.69ns ± 0%     ~     (p=0.429 n=5+5)
Log1p-16                5.00ns ± 0%  3.99ns ± 0%  -20.14%  (p=0.008 n=5+5)
Log10-16                4.22ns ± 0%  4.21ns ± 0%   -0.15%  (p=0.008 n=5+5)
Log2-16                 2.27ns ± 0%  2.25ns ± 0%   -0.94%  (p=0.008 n=5+5)
Modf-16                 1.44ns ± 0%  1.44ns ± 0%     ~     (p=0.492 n=5+5)
Nextafter32-16          2.09ns ± 0%  2.09ns ± 0%     ~     (p=0.079 n=4+5)
Nextafter64-16          2.09ns ± 0%  2.09ns ± 0%     ~     (p=0.095 n=4+5)
PowInt-16               10.8ns ± 0%  10.8ns ± 0%     ~     (all equal)
PowFrac-16              25.3ns ± 0%  25.3ns ± 0%   -0.09%  (p=0.000 n=5+4)
Pow10Pos-16             0.52ns ± 1%  0.52ns ± 0%     ~     (p=0.810 n=5+5)
Pow10Neg-16             0.82ns ± 0%  0.82ns ± 0%     ~     (p=0.381 n=5+5)
Round-16                0.93ns ± 0%  0.93ns ± 0%     ~     (p=0.056 n=5+5)
RoundToEven-16          1.64ns ± 0%  1.64ns ± 0%     ~     (all equal)
Remainder-16            12.4ns ± 2%  12.0ns ± 0%   -3.27%  (p=0.008 n=5+5)
Signbit-16              0.37ns ± 0%  0.37ns ± 0%   -0.19%  (p=0.008 n=5+5)
Sin-16                  4.04ns ± 0%  3.92ns ± 0%   -3.13%  (p=0.000 n=4+5)
Sincos-16               5.99ns ± 0%  5.80ns ± 0%   -3.03%  (p=0.008 n=5+5)
Sinh-16                 5.22ns ± 0%  5.22ns ± 0%     ~     (p=0.651 n=5+4)
SqrtIndirect-16         0.41ns ± 0%  0.41ns ± 0%     ~     (p=0.333 n=4+5)
SqrtLatency-16          2.66ns ± 0%  2.66ns ± 0%     ~     (p=0.079 n=4+5)
SqrtIndirectLatency-16  2.66ns ± 0%  2.66ns ± 0%     ~     (p=1.000 n=5+5)
SqrtGoLatency-16        30.1ns ± 0%  28.6ns ± 1%   -4.84%  (p=0.008 n=5+5)
SqrtPrime-16             645ns ± 0%   645ns ± 0%     ~     (p=0.095 n=5+4)
Tan-16                  4.21ns ± 0%  4.09ns ± 0%   -2.76%  (p=0.029 n=4+4)
Tanh-16                 5.36ns ± 0%  5.36ns ± 0%     ~     (p=0.444 n=5+5)
Trunc-16                0.12ns ± 6%  0.11ns ± 1%   -6.79%  (p=0.008 n=5+5)
Y0-16                   19.2ns ± 0%  18.7ns ± 0%   -2.52%  (p=0.000 n=5+4)
Y1-16                   19.1ns ± 0%  18.4ns ± 0%     ~     (p=0.079 n=4+5)
Yn-16                   40.7ns ± 0%  39.5ns ± 0%   -2.82%  (p=0.008 n=5+5)
Float64bits-16          0.21ns ± 0%  0.21ns ± 0%     ~     (p=0.603 n=5+5)
Float64frombits-16      0.21ns ± 0%  0.21ns ± 0%     ~     (p=0.984 n=4+5)
Float32bits-16          0.21ns ± 0%  0.21ns ± 0%     ~     (p=0.778 n=4+5)
Float32frombits-16      0.21ns ± 0%  0.20ns ± 0%     ~     (p=0.397 n=5+5)
FMA-16                  0.82ns ± 0%  0.82ns ± 0%   +0.02%  (p=0.029 n=4+4)
[Geo mean]              2.87ns       2.74ns        -4.61%

math/cmplx:

name        old time/op  new time/op  delta
Abs-16      2.07ns ± 0%  2.05ns ± 0%   -0.70%  (p=0.016 n=5+4)
Acos-16     36.5ns ± 0%  35.7ns ± 0%   -2.33%  (p=0.029 n=4+4)
Acosh-16    37.0ns ± 0%  36.2ns ± 0%   -2.20%  (p=0.008 n=5+5)
Asin-16     36.5ns ± 0%  35.7ns ± 0%   -2.29%  (p=0.008 n=5+5)
Asinh-16    33.5ns ± 0%  31.6ns ± 0%   -5.51%  (p=0.008 n=5+5)
Atan-16     15.5ns ± 0%  13.9ns ± 0%  -10.61%  (p=0.008 n=5+5)
Atanh-16    15.0ns ± 0%  13.6ns ± 0%   -9.73%  (p=0.008 n=5+5)
Conj-16     0.11ns ± 5%  0.11ns ± 1%     ~     (p=0.421 n=5+5)
Cos-16      12.3ns ± 0%  12.2ns ± 0%   -0.60%  (p=0.000 n=4+5)
Cosh-16     12.1ns ± 0%  12.0ns ± 0%     ~     (p=0.079 n=4+5)
Exp-16      10.0ns ± 0%   9.8ns ± 0%   -1.77%  (p=0.008 n=5+5)
Log-16      14.5ns ± 0%  13.7ns ± 0%   -5.67%  (p=0.008 n=5+5)
Log10-16    14.5ns ± 0%  13.7ns ± 0%   -5.55%  (p=0.000 n=5+4)
Phase-16    5.11ns ± 0%  4.25ns ± 0%  -16.90%  (p=0.008 n=5+5)
Polar-16    7.12ns ± 0%  6.35ns ± 0%  -10.90%  (p=0.008 n=5+5)
Pow-16      64.3ns ± 0%  63.7ns ± 0%   -0.97%  (p=0.008 n=5+5)
Rect-16     5.74ns ± 0%  5.58ns ± 0%   -2.73%  (p=0.016 n=4+5)
Sin-16      12.2ns ± 0%  12.2ns ± 0%   -0.54%  (p=0.000 n=4+5)
Sinh-16     12.1ns ± 0%  12.0ns ± 0%   -0.58%  (p=0.000 n=5+4)
Sqrt-16     5.30ns ± 0%  5.18ns ± 0%   -2.36%  (p=0.008 n=5+5)
Tan-16      22.7ns ± 0%  22.6ns ± 0%   -0.33%  (p=0.008 n=5+5)
Tanh-16     21.2ns ± 0%  20.9ns ± 0%   -1.32%  (p=0.008 n=5+5)
[Geo mean]  11.3ns       10.8ns        -3.97%

Change-Id: Idcc4b357ba68477929c126289e5095b27a827b1b
Reviewed-on: https://go-review.googlesource.com/c/go/+/646335
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
2025-02-13 12:34:33 -08:00

263 lines
6.5 KiB
Go

// asmcheck
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package codegen
import "math"
var sink64 [8]float64
func approx(x float64) {
// amd64/v2:-".*x86HasSSE41" amd64/v3:-".*x86HasSSE41"
// amd64:"ROUNDSD\t[$]2"
// s390x:"FIDBR\t[$]6"
// arm64:"FRINTPD"
// ppc64x:"FRIP"
// wasm:"F64Ceil"
sink64[0] = math.Ceil(x)
// amd64/v2:-".*x86HasSSE41" amd64/v3:-".*x86HasSSE41"
// amd64:"ROUNDSD\t[$]1"
// s390x:"FIDBR\t[$]7"
// arm64:"FRINTMD"
// ppc64x:"FRIM"
// wasm:"F64Floor"
sink64[1] = math.Floor(x)
// s390x:"FIDBR\t[$]1"
// arm64:"FRINTAD"
// ppc64x:"FRIN"
sink64[2] = math.Round(x)
// amd64/v2:-".*x86HasSSE41" amd64/v3:-".*x86HasSSE41"
// amd64:"ROUNDSD\t[$]3"
// s390x:"FIDBR\t[$]5"
// arm64:"FRINTZD"
// ppc64x:"FRIZ"
// wasm:"F64Trunc"
sink64[3] = math.Trunc(x)
// amd64/v2:-".*x86HasSSE41" amd64/v3:-".*x86HasSSE41"
// amd64:"ROUNDSD\t[$]0"
// s390x:"FIDBR\t[$]4"
// arm64:"FRINTND"
// wasm:"F64Nearest"
sink64[4] = math.RoundToEven(x)
}
func sqrt(x float64) float64 {
// amd64:"SQRTSD"
// 386/sse2:"SQRTSD" 386/softfloat:-"SQRTD"
// arm64:"FSQRTD"
// arm/7:"SQRTD"
// mips/hardfloat:"SQRTD" mips/softfloat:-"SQRTD"
// mips64/hardfloat:"SQRTD" mips64/softfloat:-"SQRTD"
// wasm:"F64Sqrt"
// ppc64x:"FSQRT"
// riscv64: "FSQRTD"
return math.Sqrt(x)
}
func sqrt32(x float32) float32 {
// amd64:"SQRTSS"
// 386/sse2:"SQRTSS" 386/softfloat:-"SQRTS"
// arm64:"FSQRTS"
// arm/7:"SQRTF"
// mips/hardfloat:"SQRTF" mips/softfloat:-"SQRTF"
// mips64/hardfloat:"SQRTF" mips64/softfloat:-"SQRTF"
// wasm:"F32Sqrt"
// ppc64x:"FSQRTS"
// riscv64: "FSQRTS"
return float32(math.Sqrt(float64(x)))
}
// Check that it's using integer registers
func abs(x, y float64) {
// amd64:"BTRQ\t[$]63"
// arm64:"FABSD\t"
// s390x:"LPDFR\t",-"MOVD\t" (no integer load/store)
// ppc64x:"FABS\t"
// riscv64:"FABSD\t"
// wasm:"F64Abs"
// arm/6:"ABSD\t"
// mips64/hardfloat:"ABSD\t"
// mips/hardfloat:"ABSD\t"
sink64[0] = math.Abs(x)
// amd64:"BTRQ\t[$]63","PXOR" (TODO: this should be BTSQ)
// s390x:"LNDFR\t",-"MOVD\t" (no integer load/store)
// ppc64x:"FNABS\t"
sink64[1] = -math.Abs(y)
}
// Check that it's using integer registers
func abs32(x float32) float32 {
// s390x:"LPDFR",-"LDEBR",-"LEDBR" (no float64 conversion)
return float32(math.Abs(float64(x)))
}
// Check that it's using integer registers
func copysign(a, b, c float64) {
// amd64:"BTRQ\t[$]63","ANDQ","ORQ"
// s390x:"CPSDR",-"MOVD" (no integer load/store)
// ppc64x:"FCPSGN"
// riscv64:"FSGNJD"
// wasm:"F64Copysign"
sink64[0] = math.Copysign(a, b)
// amd64:"BTSQ\t[$]63"
// s390x:"LNDFR\t",-"MOVD\t" (no integer load/store)
// ppc64x:"FCPSGN"
// riscv64:"FSGNJD"
// arm64:"ORR", -"AND"
sink64[1] = math.Copysign(c, -1)
// Like math.Copysign(c, -1), but with integer operations. Useful
// for platforms that have a copysign opcode to see if it's detected.
// s390x:"LNDFR\t",-"MOVD\t" (no integer load/store)
sink64[2] = math.Float64frombits(math.Float64bits(a) | 1<<63)
// amd64:"ANDQ","ORQ"
// s390x:"CPSDR\t",-"MOVD\t" (no integer load/store)
// ppc64x:"FCPSGN"
// riscv64:"FSGNJD"
sink64[3] = math.Copysign(-1, c)
}
func fma(x, y, z float64) float64 {
// amd64/v3:-".*x86HasFMA"
// amd64:"VFMADD231SD"
// arm/6:"FMULAD"
// arm64:"FMADDD"
// loong64:"FMADDD"
// s390x:"FMADD"
// ppc64x:"FMADD"
// riscv64:"FMADDD"
return math.FMA(x, y, z)
}
func fms(x, y, z float64) float64 {
// riscv64:"FMSUBD"
return math.FMA(x, y, -z)
}
func fnms(x, y, z float64) float64 {
// riscv64:"FNMSUBD",-"FNMADDD"
return math.FMA(-x, y, z)
}
func fnma(x, y, z float64) float64 {
// riscv64:"FNMADDD",-"FNMSUBD"
return math.FMA(x, -y, -z)
}
func fromFloat64(f64 float64) uint64 {
// amd64:"MOVQ\tX.*, [^X].*"
// arm64:"FMOVD\tF.*, R.*"
// loong64:"MOVV\tF.*, R.*"
// ppc64x:"MFVSRD"
// mips64/hardfloat:"MOVV\tF.*, R.*"
return math.Float64bits(f64+1) + 1
}
func fromFloat32(f32 float32) uint32 {
// amd64:"MOVL\tX.*, [^X].*"
// arm64:"FMOVS\tF.*, R.*"
// loong64:"MOVW\tF.*, R.*"
// mips64/hardfloat:"MOVW\tF.*, R.*"
return math.Float32bits(f32+1) + 1
}
func toFloat64(u64 uint64) float64 {
// amd64:"MOVQ\t[^X].*, X.*"
// arm64:"FMOVD\tR.*, F.*"
// loong64:"MOVV\tR.*, F.*"
// ppc64x:"MTVSRD"
// mips64/hardfloat:"MOVV\tR.*, F.*"
return math.Float64frombits(u64+1) + 1
}
func toFloat32(u32 uint32) float32 {
// amd64:"MOVL\t[^X].*, X.*"
// arm64:"FMOVS\tR.*, F.*"
// loong64:"MOVW\tR.*, F.*"
// mips64/hardfloat:"MOVW\tR.*, F.*"
return math.Float32frombits(u32+1) + 1
}
// Test that comparisons with constants converted to float
// are evaluated at compile-time
func constantCheck64() bool {
// amd64:"(MOVB\t[$]0)|(XORL\t[A-Z][A-Z0-9]+, [A-Z][A-Z0-9]+)",-"FCMP",-"MOVB\t[$]1"
// s390x:"MOV(B|BZ|D)\t[$]0,",-"FCMPU",-"MOV(B|BZ|D)\t[$]1,"
return 0.5 == float64(uint32(1)) || 1.5 > float64(uint64(1<<63))
}
func constantCheck32() bool {
// amd64:"MOV(B|L)\t[$]1",-"FCMP",-"MOV(B|L)\t[$]0"
// s390x:"MOV(B|BZ|D)\t[$]1,",-"FCMPU",-"MOV(B|BZ|D)\t[$]0,"
return float32(0.5) <= float32(int64(1)) && float32(1.5) >= float32(int32(-1<<31))
}
// Test that integer constants are converted to floating point constants
// at compile-time
func constantConvert32(x float32) float32 {
// amd64:"MOVSS\t[$]f32.3f800000\\(SB\\)"
// s390x:"FMOVS\t[$]f32.3f800000\\(SB\\)"
// ppc64x/power8:"FMOVS\t[$]f32.3f800000\\(SB\\)"
// ppc64x/power9:"FMOVS\t[$]f32.3f800000\\(SB\\)"
// ppc64x/power10:"XXSPLTIDP\t[$]1065353216, VS0"
// arm64:"FMOVS\t[$]\\(1.0\\)"
if x > math.Float32frombits(0x3f800000) {
return -x
}
return x
}
func constantConvertInt32(x uint32) uint32 {
// amd64:-"MOVSS"
// s390x:-"FMOVS"
// ppc64x:-"FMOVS"
// arm64:-"FMOVS"
if x > math.Float32bits(1) {
return -x
}
return x
}
func nanGenerate64() float64 {
// Test to make sure we don't generate a NaN while constant propagating.
// See issue 36400.
zero := 0.0
// amd64:-"DIVSD"
inf := 1 / zero // +inf. We can constant propagate this one.
negone := -1.0
// amd64:"DIVSD"
z0 := zero / zero
// amd64/v1,amd64/v2:"MULSD"
z1 := zero * inf
// amd64:"SQRTSD"
z2 := math.Sqrt(negone)
// amd64/v3:"VFMADD231SD"
return z0 + z1 + z2
}
func nanGenerate32() float32 {
zero := float32(0.0)
// amd64:-"DIVSS"
inf := 1 / zero // +inf. We can constant propagate this one.
// amd64:"DIVSS"
z0 := zero / zero
// amd64/v1,amd64/v2:"MULSS"
z1 := zero * inf
// amd64/v3:"VFMADD231SS"
return z0 + z1
}