cmd/compile: improve store-to-load forwarding with compatible types

Improve the compiler's store-to-load forwarding optimization by relaxing the
type comparison condition. Instead of requiring exact type equality (CMPeq),
we now use copyCompatibleType which allows forwarding between compatible
types where safe.

Fix several size comparison bugs in the nested store patterns. Previously,
we were comparing the size of the outer store with the load type,
rather than comparing with the size of the actual store being forwarded
from.

Skip OpConvert in dead store elimination to help get rid of dead stores such
as zeroing slices. OpConvert, like OpInlMark, doesn't really use the memory.

This optimization is particularly beneficial for code that creates slices with
computed pointers, such as the runtime's heapBitsSlice function, where
intermediate calculations were previously causing the compiler to miss
store-to-load forwarding opportunities.

Local sweet run result on an x86_64 laptop:

                       │  Orig.res   │              Hopt.res              │
                       │   sec/op    │   sec/op     vs base               │
BiogoIgor-8               5.303 ± 1%    5.322 ± 1%       ~ (p=0.190 n=10)
BiogoKrishna-8            7.894 ± 1%    7.828 ± 2%       ~ (p=0.190 n=10)
BleveIndexBatch100-8      2.257 ± 1%    2.248 ± 2%       ~ (p=0.529 n=10)
EtcdPut-8                30.12m ± 1%   30.03m ± 1%       ~ (p=0.796 n=10)
EtcdSTM-8                127.1m ± 1%   126.2m ± 0%  -0.74% (p=0.023 n=10)
GoBuildKubelet-8          52.21 ± 0%    52.05 ± 1%       ~ (p=0.063 n=10)
GoBuildKubeletLink-8      4.342 ± 1%    4.305 ± 0%  -0.85% (p=0.000 n=10)
GoBuildIstioctl-8         43.33 ± 0%    43.24 ± 0%  -0.22% (p=0.015 n=10)
GoBuildIstioctlLink-8     4.604 ± 1%    4.598 ± 0%       ~ (p=0.063 n=10)
GoBuildFrontend-8         15.33 ± 0%    15.29 ± 0%       ~ (p=0.143 n=10)
GoBuildFrontendLink-8    740.0m ± 1%   737.7m ± 1%       ~ (p=0.912 n=10)
GopherLuaKNucleotide-8    9.590 ± 1%    9.656 ± 1%       ~ (p=0.165 n=10)
MarkdownRenderXHTML-8    96.97m ± 1%   97.26m ± 2%       ~ (p=0.105 n=10)
Tile38QueryLoad-8        335.9µ ± 1%   335.6µ ± 1%       ~ (p=0.481 n=10)
geomean                   1.336         1.333       -0.22%

Change-Id: I031552623e6d5a3b1b5be8325e6314706e45534f
Reviewed-on: https://go-review.googlesource.com/c/go/+/662075
Reviewed-by: Carlos Amedee <carlos@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Carlos Amedee <carlos@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
Alexander Musman 2025-04-01 18:43:38 +03:00 committed by Gopher Robot
parent 822031dffc
commit 16a6b71f18
5 changed files with 68 additions and 18 deletions

View File

@ -736,26 +736,26 @@
// Load of store of same address, with compatibly typed value and same size
(Load <t1> p1 (Store {t2} p2 x _))
&& isSamePtr(p1, p2)
&& t1.Compare(x.Type) == types.CMPeq
&& copyCompatibleType(t1, x.Type)
&& t1.Size() == t2.Size()
=> x
(Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 x _)))
&& isSamePtr(p1, p3)
&& t1.Compare(x.Type) == types.CMPeq
&& t1.Size() == t2.Size()
&& copyCompatibleType(t1, x.Type)
&& t1.Size() == t3.Size()
&& disjoint(p3, t3.Size(), p2, t2.Size())
=> x
(Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 _ (Store {t4} p4 x _))))
&& isSamePtr(p1, p4)
&& t1.Compare(x.Type) == types.CMPeq
&& t1.Size() == t2.Size()
&& copyCompatibleType(t1, x.Type)
&& t1.Size() == t4.Size()
&& disjoint(p4, t4.Size(), p2, t2.Size())
&& disjoint(p4, t4.Size(), p3, t3.Size())
=> x
(Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 _ (Store {t4} p4 _ (Store {t5} p5 x _)))))
&& isSamePtr(p1, p5)
&& t1.Compare(x.Type) == types.CMPeq
&& t1.Size() == t2.Size()
&& copyCompatibleType(t1, x.Type)
&& t1.Size() == t5.Size()
&& disjoint(p5, t5.Size(), p2, t2.Size())
&& disjoint(p5, t5.Size(), p3, t3.Size())
&& disjoint(p5, t5.Size(), p4, t4.Size())

View File

@ -55,7 +55,7 @@ func dse(f *Func) {
}
continue
}
if v.Op == OpInlMark {
if v.Op == OpInlMark || v.Op == OpConvert {
// Not really a use of the memory. See #67957.
continue
}

View File

@ -246,6 +246,19 @@ func isPtr(t *types.Type) bool {
return t.IsPtrShaped()
}
func copyCompatibleType(t1, t2 *types.Type) bool {
if t1.Size() != t2.Size() {
return false
}
if t1.IsInteger() {
return t2.IsInteger()
}
if isPtr(t1) {
return isPtr(t2)
}
return t1.Compare(t2) == types.CMPeq
}
// mergeSym merges two symbolic offsets. There is no real merging of
// offsets, we just pick the non-nil one.
func mergeSym(x, y Sym) Sym {
@ -818,12 +831,23 @@ func devirtLECall(v *Value, sym *obj.LSym) *Value {
// isSamePtr reports whether p1 and p2 point to the same address.
func isSamePtr(p1, p2 *Value) bool {
if p1 == p2 {
return true
}
if p1.Op != p2.Op {
for p1.Op == OpOffPtr && p1.AuxInt == 0 {
p1 = p1.Args[0]
}
for p2.Op == OpOffPtr && p2.AuxInt == 0 {
p2 = p2.Args[0]
}
if p1 == p2 {
return true
}
if p1.Op != p2.Op {
return false
}
}
switch p1.Op {
case OpOffPtr:
return p1.AuxInt == p2.AuxInt && isSamePtr(p1.Args[0], p2.Args[0])

View File

@ -13599,7 +13599,7 @@ func rewriteValuegeneric_OpLoad(v *Value) bool {
config := b.Func.Config
typ := &b.Func.Config.Types
// match: (Load <t1> p1 (Store {t2} p2 x _))
// cond: isSamePtr(p1, p2) && t1.Compare(x.Type) == types.CMPeq && t1.Size() == t2.Size()
// cond: isSamePtr(p1, p2) && copyCompatibleType(t1, x.Type) && t1.Size() == t2.Size()
// result: x
for {
t1 := v.Type
@ -13610,14 +13610,14 @@ func rewriteValuegeneric_OpLoad(v *Value) bool {
t2 := auxToType(v_1.Aux)
x := v_1.Args[1]
p2 := v_1.Args[0]
if !(isSamePtr(p1, p2) && t1.Compare(x.Type) == types.CMPeq && t1.Size() == t2.Size()) {
if !(isSamePtr(p1, p2) && copyCompatibleType(t1, x.Type) && t1.Size() == t2.Size()) {
break
}
v.copyOf(x)
return true
}
// match: (Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 x _)))
// cond: isSamePtr(p1, p3) && t1.Compare(x.Type) == types.CMPeq && t1.Size() == t2.Size() && disjoint(p3, t3.Size(), p2, t2.Size())
// cond: isSamePtr(p1, p3) && copyCompatibleType(t1, x.Type) && t1.Size() == t3.Size() && disjoint(p3, t3.Size(), p2, t2.Size())
// result: x
for {
t1 := v.Type
@ -13635,14 +13635,14 @@ func rewriteValuegeneric_OpLoad(v *Value) bool {
t3 := auxToType(v_1_2.Aux)
x := v_1_2.Args[1]
p3 := v_1_2.Args[0]
if !(isSamePtr(p1, p3) && t1.Compare(x.Type) == types.CMPeq && t1.Size() == t2.Size() && disjoint(p3, t3.Size(), p2, t2.Size())) {
if !(isSamePtr(p1, p3) && copyCompatibleType(t1, x.Type) && t1.Size() == t3.Size() && disjoint(p3, t3.Size(), p2, t2.Size())) {
break
}
v.copyOf(x)
return true
}
// match: (Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 _ (Store {t4} p4 x _))))
// cond: isSamePtr(p1, p4) && t1.Compare(x.Type) == types.CMPeq && t1.Size() == t2.Size() && disjoint(p4, t4.Size(), p2, t2.Size()) && disjoint(p4, t4.Size(), p3, t3.Size())
// cond: isSamePtr(p1, p4) && copyCompatibleType(t1, x.Type) && t1.Size() == t4.Size() && disjoint(p4, t4.Size(), p2, t2.Size()) && disjoint(p4, t4.Size(), p3, t3.Size())
// result: x
for {
t1 := v.Type
@ -13667,14 +13667,14 @@ func rewriteValuegeneric_OpLoad(v *Value) bool {
t4 := auxToType(v_1_2_2.Aux)
x := v_1_2_2.Args[1]
p4 := v_1_2_2.Args[0]
if !(isSamePtr(p1, p4) && t1.Compare(x.Type) == types.CMPeq && t1.Size() == t2.Size() && disjoint(p4, t4.Size(), p2, t2.Size()) && disjoint(p4, t4.Size(), p3, t3.Size())) {
if !(isSamePtr(p1, p4) && copyCompatibleType(t1, x.Type) && t1.Size() == t4.Size() && disjoint(p4, t4.Size(), p2, t2.Size()) && disjoint(p4, t4.Size(), p3, t3.Size())) {
break
}
v.copyOf(x)
return true
}
// match: (Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 _ (Store {t4} p4 _ (Store {t5} p5 x _)))))
// cond: isSamePtr(p1, p5) && t1.Compare(x.Type) == types.CMPeq && t1.Size() == t2.Size() && disjoint(p5, t5.Size(), p2, t2.Size()) && disjoint(p5, t5.Size(), p3, t3.Size()) && disjoint(p5, t5.Size(), p4, t4.Size())
// cond: isSamePtr(p1, p5) && copyCompatibleType(t1, x.Type) && t1.Size() == t5.Size() && disjoint(p5, t5.Size(), p2, t2.Size()) && disjoint(p5, t5.Size(), p3, t3.Size()) && disjoint(p5, t5.Size(), p4, t4.Size())
// result: x
for {
t1 := v.Type
@ -13706,7 +13706,7 @@ func rewriteValuegeneric_OpLoad(v *Value) bool {
t5 := auxToType(v_1_2_2_2.Aux)
x := v_1_2_2_2.Args[1]
p5 := v_1_2_2_2.Args[0]
if !(isSamePtr(p1, p5) && t1.Compare(x.Type) == types.CMPeq && t1.Size() == t2.Size() && disjoint(p5, t5.Size(), p2, t2.Size()) && disjoint(p5, t5.Size(), p3, t3.Size()) && disjoint(p5, t5.Size(), p4, t4.Size())) {
if !(isSamePtr(p1, p5) && copyCompatibleType(t1, x.Type) && t1.Size() == t5.Size() && disjoint(p5, t5.Size(), p2, t2.Size()) && disjoint(p5, t5.Size(), p3, t3.Size()) && disjoint(p5, t5.Size(), p4, t4.Size())) {
break
}
v.copyOf(x)

View File

@ -6,7 +6,10 @@
package codegen
import "runtime"
import (
"runtime"
"unsafe"
)
// This file contains code generation tests related to the use of the
// stack.
@ -128,6 +131,29 @@ func spillSlotReuse() {
getp2()[nopInt()] = 0
}
// Check that no stack frame space is needed for simple slice initialization with underlying structure.
type mySlice struct {
array unsafe.Pointer
len int
cap int
}
// amd64:"TEXT\t.*, [$]0-"
func sliceInit(base uintptr) []uintptr {
const ptrSize = 8
size := uintptr(4096)
bitmapSize := size / ptrSize / 8
elements := int(bitmapSize / ptrSize)
var sl mySlice
sl = mySlice{
unsafe.Pointer(base + size - bitmapSize),
elements,
elements,
}
// amd64:-"POPQ",-"SP"
return *(*[]uintptr)(unsafe.Pointer(&sl))
}
//go:noinline
func nopInt() int {
return 0