cmd/compile: match more patterns for shortcircuit

This CL tries to generalize the pattern matching of certain
shortcircuit-able CFGs a bit more:
For a shortcircuit-able CFG:
p   q
 \ /
  b
 / \
t   u
Where the constant branch is t, and b has multiple phi values
other than the control phi.

For the non-constant branch target u, we try to match the
"diamond" shape CFG:
p   q
 \ /
  b
 / \
t   u
 \ /
  m

or

p   q
 \ /
  b
  |\
  | u
  |/
  m

Instead of matching u as a single block, we know try to
generalize it as a subgraph that satisfy condition:
it's a DAG that has a single entry point u, and has a
path to m.

compilebench stats:
                         │   old.txt   │              new.txt               │
                         │   sec/op    │   sec/op     vs base               │
Template                   109.4m ± 3%   109.8m ± 3%       ~ (p=0.796 n=10)
Unicode                    94.23m ± 1%   93.85m ± 1%       ~ (p=0.631 n=10)
GoTypes                    538.2m ± 1%   538.8m ± 1%       ~ (p=0.912 n=10)
Compiler                   90.21m ± 1%   90.02m ± 1%       ~ (p=0.436 n=10)
SSA                         3.318 ± 1%    3.323 ± 1%       ~ (p=1.000 n=10)
Flate                      69.38m ± 1%   69.57m ± 2%       ~ (p=0.529 n=10)
GoParser                   128.5m ± 1%   127.4m ± 1%       ~ (p=0.075 n=10)
Reflect                    267.4m ± 1%   267.2m ± 2%       ~ (p=0.739 n=10)
Tar                        127.7m ± 2%   126.4m ± 1%       ~ (p=0.353 n=10)
XML                        149.5m ± 1%   149.6m ± 2%       ~ (p=0.684 n=10)
LinkCompiler               390.0m ± 1%   388.4m ± 2%       ~ (p=0.353 n=10)
ExternalLinkCompiler        1.296 ± 0%    1.296 ± 1%       ~ (p=0.971 n=10)
LinkWithoutDebugCompiler   226.3m ± 1%   225.5m ± 1%       ~ (p=0.393 n=10)
StdCmd                      13.26 ± 0%    13.25 ± 1%       ~ (p=0.529 n=10)
geomean                    319.3m        318.8m       -0.17%

                         │   old.txt   │               new.txt               │
                         │ user-sec/op │ user-sec/op   vs base               │
Template                   293.1m ± 3%   291.4m ± 11%       ~ (p=0.436 n=10)
Unicode                    91.09m ± 5%   87.61m ±  7%       ~ (p=0.165 n=10)
GoTypes                     1.932 ± 3%    1.926 ±  3%       ~ (p=0.739 n=10)
Compiler                   125.8m ± 3%   121.5m ± 10%       ~ (p=0.481 n=10)
SSA                         18.93 ± 3%    18.89 ±  1%       ~ (p=0.684 n=10)
Flate                      158.5m ± 5%   160.0m ±  7%       ~ (p=0.971 n=10)
GoParser                   316.0m ± 9%   327.4m ±  7%       ~ (p=0.052 n=10)
Reflect                    845.6m ± 6%   861.6m ±  3%       ~ (p=0.579 n=10)
Tar                        358.1m ± 5%   348.5m ±  4%       ~ (p=0.089 n=10)
XML                        382.4m ± 4%   392.2m ±  3%       ~ (p=0.143 n=10)
LinkCompiler               609.1m ± 4%   627.9m ±  3%       ~ (p=0.123 n=10)
ExternalLinkCompiler        1.336 ± 2%    1.343 ±  4%       ~ (p=0.565 n=10)
LinkWithoutDebugCompiler   248.7m ± 3%   248.0m ±  1%       ~ (p=0.853 n=10)
geomean                    506.4m        506.8m        +0.08%

          │   old.txt    │               new.txt               │
          │  text-bytes  │  text-bytes   vs base               │
HelloSize   965.8Ki ± 0%   965.0Ki ± 0%  -0.08% (p=0.000 n=10)
CmdGoSize   12.30Mi ± 0%   12.29Mi ± 0%  -0.08% (p=0.000 n=10)
geomean     3.406Mi        3.403Mi       -0.08%

          │   old.txt    │                new.txt                │
          │  data-bytes  │  data-bytes   vs base                 │
HelloSize   15.08Ki ± 0%   15.08Ki ± 0%       ~ (p=1.000 n=10) ¹
CmdGoSize   408.5Ki ± 0%   408.5Ki ± 0%       ~ (p=1.000 n=10) ¹
geomean     78.49Ki        78.49Ki       +0.00%
¹ all samples are equal

          │   old.txt    │                new.txt                │
          │  bss-bytes   │  bss-bytes    vs base                 │
HelloSize   142.0Ki ± 0%   142.0Ki ± 0%       ~ (p=1.000 n=10) ¹
CmdGoSize   206.4Ki ± 0%   206.4Ki ± 0%       ~ (p=1.000 n=10) ¹
geomean     171.2Ki        171.2Ki       +0.00%
¹ all samples are equal

          │   old.txt    │               new.txt               │
          │  exe-bytes   │  exe-bytes    vs base               │
HelloSize   1.466Mi ± 0%   1.462Mi ± 0%  -0.27% (p=0.000 n=10)
CmdGoSize   18.19Mi ± 0%   18.17Mi ± 0%  -0.10% (p=0.000 n=10)
geomean     5.164Mi        5.154Mi       -0.18%

Fixes #72132

Change-Id: I3d1cb10b6a158c5750adc23c79709d63dbd771f3
Reviewed-on: https://go-review.googlesource.com/c/go/+/656255
Auto-Submit: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
This commit is contained in:
Junyang Shao 2025-03-09 20:14:32 +00:00 committed by Gopher Robot
parent b3aff930cf
commit 4d6722a8fd

View File

@ -135,6 +135,8 @@ func shortcircuitBlock(b *Block) bool {
// to reason about the values of phis.
return false
}
// We only process blocks with only phi values except for control
// value and its wrappers.
if len(b.Values) != nval+nOtherPhi {
return false
}
@ -293,78 +295,85 @@ func shortcircuitPhiPlan(b *Block, ctl *Value, cidx int, ti int64) func(*Value,
// In these cases, we can reconstruct what the value
// of any phi in b must be in the successor blocks.
if len(t.Preds) == 1 && len(t.Succs) == 1 &&
len(u.Preds) == 1 && len(u.Succs) == 1 &&
t.Succs[0].b == u.Succs[0].b && len(t.Succs[0].b.Preds) == 2 {
// p q
// \ /
// b
// / \
// t u
// \ /
// m
//
// After the CFG modifications, this will look like
//
// p q
// | /
// | b
// |/ \
// t u
// \ /
// m
//
// NB: t.Preds is (b, p), not (p, b).
if len(t.Preds) == 1 && len(t.Succs) == 1 && len(u.Preds) == 1 &&
len(t.Succs[0].b.Preds) == 2 {
m := t.Succs[0].b
return func(v *Value, i int) {
// Replace any uses of v in t and u with the value v must have,
// given that we have arrived at that block.
// Then move v to m and adjust its value accordingly;
// this handles all other uses of v.
argP, argQ := v.Args[cidx], v.Args[1^cidx]
u.replaceUses(v, argQ)
phi := t.Func.newValue(OpPhi, v.Type, t, v.Pos)
phi.AddArg2(argQ, argP)
t.replaceUses(v, phi)
if v.Uses == 0 {
return
}
v.moveTo(m, i)
// The phi in m belongs to whichever pred idx corresponds to t.
if m.Preds[0].b == t {
v.SetArgs2(phi, argQ)
} else {
v.SetArgs2(argQ, phi)
if visited := u.flowsTo(m, 5); visited != nil {
// p q
// \ /
// b
// / \
// t U (sub graph that satisfy condition in flowsTo)
// \ /
// m
//
// After the CFG modifications, this will look like
//
// p q
// | /
// | b
// |/ \
// t U
// \ /
// m
//
// NB: t.Preds is (b, p), not (p, b).
return func(v *Value, i int) {
// Replace any uses of v in t and u with the value v must have,
// given that we have arrived at that block.
// Then move v to m and adjust its value accordingly;
// this handles all other uses of v.
argP, argQ := v.Args[cidx], v.Args[1^cidx]
phi := t.Func.newValue(OpPhi, v.Type, t, v.Pos)
phi.AddArg2(argQ, argP)
t.replaceUses(v, phi)
for bb := range visited {
bb.replaceUses(v, argQ)
}
if v.Uses == 0 {
return
}
v.moveTo(m, i)
// The phi in m belongs to whichever pred idx corresponds to t.
if m.Preds[0].b == t {
v.SetArgs2(phi, argQ)
} else {
v.SetArgs2(argQ, phi)
}
}
}
}
if len(t.Preds) == 2 && len(u.Preds) == 1 && len(u.Succs) == 1 && u.Succs[0].b == t {
// p q
// \ /
// b
// |\
// | u
// |/
// t
//
// After the CFG modifications, this will look like
//
// q
// /
// b
// |\
// p | u
// \|/
// t
//
// NB: t.Preds is (b or u, b or u, p).
return func(v *Value, i int) {
// Replace any uses of v in u. Then move v to t.
argP, argQ := v.Args[cidx], v.Args[1^cidx]
u.replaceUses(v, argQ)
v.moveTo(t, i)
v.SetArgs3(argQ, argQ, argP)
if len(t.Preds) == 2 && len(u.Preds) == 1 {
if visited := u.flowsTo(t, 5); visited != nil {
// p q
// \ /
// b
// |\
// | U ((sub graph that satisfy condition in flowsTo))
// |/
// t
//
// After the CFG modifications, this will look like
//
// q
// /
// b
// |\
// p | U
// \|/
// t
//
// NB: t.Preds is (b or U, b or U, p).
return func(v *Value, i int) {
// Replace any uses of v in U. Then move v to t.
argP, argQ := v.Args[cidx], v.Args[1^cidx]
for bb := range visited {
bb.replaceUses(v, argQ)
}
v.moveTo(t, i)
v.SetArgs3(argQ, argQ, argP)
}
}
}
@ -511,3 +520,77 @@ func (v *Value) moveTo(dst *Block, i int) {
src.Values[last] = nil
src.Values = src.Values[:last]
}
// flowsTo checks that the subgraph starting from v and ends at t is a DAG, with
// the following constraints:
//
// (1) v can reach t.
// (2) v's connected component removing the paths containing t is a DAG.
// (3) The blocks in the subgraph G defined in (2) has all their preds also in G,
// except v.
// (4) The subgraph defined in (2) has a size smaller than cap.
//
// We know that the subgraph G defined in constraint (2)(3) has the property that v
// dominates all the blocks in G:
// If there exist a block x in G that is not dominated by v, then there exist a
// path P from entry to x that does not contain v. Denote x's predecessor in P
// as x', then x' must also be in G given constraint (3), same to its pred x''
// in P. Given constraint (2), by going back in P we will in the end reach v,
// which conflicts with the definition of P.
//
// Constraint (2)'s DAG requirement could be further relaxed to contain "internal"
// loops that doesn't change the dominance relation of v. But that is more subtle
// and requires another constraint on the source block v, and a more complex proof.
// Furthermore optimizing the branch guarding a loop might bring less gains as the
// loop itself might be the bottleneck.
func (v *Block) flowsTo(t *Block, cap int) map[*Block]struct{} {
seen := map[*Block]struct{}{}
var boundedDFS func(b *Block)
hasPathToT := false
fullyExplored := true
isDAG := true
visited := map[*Block]struct{}{}
boundedDFS = func(b *Block) {
if _, ok := seen[b]; ok {
return
}
if _, ok := visited[b]; ok {
isDAG = false
return
}
if b == t {
// do not put t into seen, this way
// if v can reach t's connected component without going through t,
// it will fail the pred check after boundedDFSUntil.
hasPathToT = true
return
}
if len(seen) > cap {
fullyExplored = false
return
}
seen[b] = struct{}{}
visited[b] = struct{}{}
for _, se := range b.Succs {
boundedDFS(se.b)
if !(isDAG && fullyExplored) {
return
}
}
delete(visited, b)
}
boundedDFS(v)
if hasPathToT && fullyExplored && isDAG {
for b := range seen {
if b != v {
for _, se := range b.Preds {
if _, ok := seen[se.b]; !ok {
return nil
}
}
}
}
return seen
}
return nil
}