cmd/compile: pair loads and stores on arm64

Look for possible paired load/store operations on arm64. I don't
expect this would be a lot faster, but it will save binary space,
and, indirectly through the icache, at least a bit of time.

Change-Id: I4dd73b0e6329c4659b7453998f9b75320fcf380b
Reviewed-on: https://go-review.googlesource.com/c/go/+/629256
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>

parent 0b88a87879
commit 20d7c57422
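
For orientation, a minimal source-level sketch of the transformation this pass aims for (our illustration, not part of the commit; the commit's own codegen tests appear further down). Two loads from adjacent struct fields previously compiled to two LDR instructions; after this pass they can become a single LDP:

	package p

	// sum reads two adjacent 8-byte fields. Unpaired, this needs two
	// LDR instructions; the pair pass rewrites them into one LDP.
	func sum(q *struct{ a, b int64 }) int64 {
		return q.a + q.b
	}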
@@ -488,6 +488,7 @@ var passes = [...]pass{
 	{name: "lower", fn: lower, required: true},
 	{name: "addressing modes", fn: addressingModes, required: false},
 	{name: "late lower", fn: lateLower, required: true},
+	{name: "pair", fn: pair},
 	{name: "lowered deadcode for cse", fn: deadcode}, // deadcode immediately before CSE avoids CSE making dead values live again
 	{name: "lowered cse", fn: cse},
 	{name: "elim unread autos", fn: elimUnreadAutos},
src/cmd/compile/internal/ssa/pair.go (new file, 357 lines)
@@ -0,0 +1,357 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package ssa

import (
	"cmd/compile/internal/ir"
	"cmd/compile/internal/types"
	"slices"
)

// The pair pass finds memory operations that can be paired up
// into single 2-register memory instructions.
func pair(f *Func) {
	// Only arm64 for now. This pass is fairly arch-specific.
	switch f.Config.arch {
	case "arm64":
	default:
		return
	}
	pairLoads(f)
	pairStores(f)
}

type pairableLoadInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair  Op
}

// All pairableLoad ops must take 2 arguments, a pointer and a memory.
// They must also take an offset in Aux/AuxInt.
var pairableLoads = map[Op]pairableLoadInfo{
	OpARM64MOVDload:  {8, OpARM64LDP},
	OpARM64MOVWload:  {4, OpARM64LDPW},
	OpARM64FMOVDload: {8, OpARM64FLDPD},
	OpARM64FMOVSload: {4, OpARM64FLDPS},
}

type pairableStoreInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair  Op
}

// All pairableStore keys must take 3 arguments, a pointer, a value, and a memory.
// All pairableStore values must take 4 arguments, a pointer, 2 values, and a memory.
// They must also take an offset in Aux/AuxInt.
var pairableStores = map[Op]pairableStoreInfo{
	OpARM64MOVDstore:  {8, OpARM64STP},
	OpARM64MOVWstore:  {4, OpARM64STPW},
	OpARM64FMOVDstore: {8, OpARM64FSTPD},
	OpARM64FMOVSstore: {4, OpARM64FSTPS},
	// TODO: storezero variants.
}

// offsetOk returns true if a pair instruction should be used
// for the offset Aux+off, when the data width (of the
// unpaired instructions) is width.
// This function is best-effort. The compiled function must
// still work if offsetOk always returns true.
// TODO: this is currently arm64-specific.
func offsetOk(aux Aux, off, width int64) bool {
	if true {
		// Seems to generate slightly smaller code if we just
		// always allow this rewrite.
		//
		// Without pairing, we have 2 load instructions, like:
		//   LDR 88(R0), R1
		//   LDR 96(R0), R2
		// with pairing we have, best case:
		//   LDP 88(R0), R1, R2
		// but maybe we need an adjuster if out of range or unaligned:
		//   ADD R0, $88, R27
		//   LDP (R27), R1, R2
		// Even with the adjuster, it is at least no worse.
		//
		// A similar situation occurs when accessing globals.
		// Two loads from globals require 4 instructions,
		// two ADRP and two LDR. With pairing, we need
		// ADRP+ADD+LDP, three instructions.
		//
		// With pairing, it looks like the critical path might
		// be a little bit longer. But it should never be more
		// instructions.
		// TODO: see if that longer critical path causes any
		// regressions.
		return true
	}
	if aux != nil {
		if _, ok := aux.(*ir.Name); !ok {
			// Offset is probably too big (globals).
			return false
		}
		// We let *ir.Names pass here, as
		// they are probably small offsets from SP.
		// There's no guarantee that we're in range
		// in that case though (we don't know the
		// stack frame size yet), so the assembler
		// might need to issue fixup instructions.
		// Assume some small frame size.
		if off >= 0 {
			off += 120
		}
		// TODO: figure out how often this helps vs. hurts.
	}
	switch width {
	case 4:
		if off >= -256 && off <= 252 && off%4 == 0 {
			return true
		}
	case 8:
		if off >= -512 && off <= 504 && off%8 == 0 {
			return true
		}
	}
	return false
}
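
The bounds in the final switch mirror the arm64 LDP/STP immediate encoding, a signed 7-bit field scaled by the element width; this gloss, and the helper below, is ours and not part of the patch:

	package main

	import "fmt"

	// imm7Range computes the offset range of a scaled signed 7-bit
	// immediate, as used by arm64 LDP/STP. Hypothetical helper for
	// illustration only; not part of the compiler.
	func imm7Range(width int64) (lo, hi int64) {
		return -64 * width, 63 * width
	}

	func main() {
		fmt.Println(imm7Range(4)) // -256 252, the case 4 bounds above
		fmt.Println(imm7Range(8)) // -512 504, the case 8 bounds above
	}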

func pairLoads(f *Func) {
	var loads []*Value

	// Registry of aux values for sorting.
	auxIDs := map[Aux]int{}
	auxID := func(aux Aux) int {
		id, ok := auxIDs[aux]
		if !ok {
			id = len(auxIDs)
			auxIDs[aux] = id
		}
		return id
	}

	for _, b := range f.Blocks {
		// Find loads.
		loads = loads[:0]
		clear(auxIDs)
		for _, v := range b.Values {
			info := pairableLoads[v.Op]
			if info.width == 0 {
				continue // not pairable
			}
			if !offsetOk(v.Aux, v.AuxInt, info.width) {
				continue // not advisable
			}
			loads = append(loads, v)
		}
		if len(loads) < 2 {
			continue
		}

		// Sort to put pairable loads together.
		slices.SortFunc(loads, func(x, y *Value) int {
			// First sort by op, ptr, and memory arg.
			if x.Op != y.Op {
				return int(x.Op - y.Op)
			}
			if x.Args[0].ID != y.Args[0].ID {
				return int(x.Args[0].ID - y.Args[0].ID)
			}
			if x.Args[1].ID != y.Args[1].ID {
				return int(x.Args[1].ID - y.Args[1].ID)
			}
			// Then sort by aux. (nil first, then by aux ID)
			if x.Aux != nil {
				if y.Aux == nil {
					return 1
				}
				a, b := auxID(x.Aux), auxID(y.Aux)
				if a != b {
					return a - b
				}
			} else if y.Aux != nil {
				return -1
			}
			// Then sort by offset, low to high.
			return int(x.AuxInt - y.AuxInt)
		})

		// Look for pairable loads.
		for i := 0; i < len(loads)-1; i++ {
			x := loads[i]
			y := loads[i+1]
			if x.Op != y.Op || x.Args[0] != y.Args[0] || x.Args[1] != y.Args[1] {
				continue
			}
			if x.Aux != y.Aux {
				continue
			}
			if x.AuxInt+pairableLoads[x.Op].width != y.AuxInt {
				continue
			}

			// Commit point.

			// Make the 2-register load.
			load := b.NewValue2IA(x.Pos, pairableLoads[x.Op].pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])

			// Modify x to be (Select0 load). Similar for y.
			x.reset(OpSelect0)
			x.SetArgs1(load)
			y.reset(OpSelect1)
			y.SetArgs1(load)

			i++ // Skip y next time around the loop.
		}
	}
}
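
Schematically, the load rewrite replaces two single loads with one tuple-producing LDP plus two projections, so existing uses of the original values stay valid (value numbering and syntax below are our approximation of an ssa.html dump, not output from the patch):

	v1 = MOVDload <int64> [8] ptr mem
	v2 = MOVDload <int64> [16] ptr mem

becomes

	v3 = LDP <(int64, int64)> [8] ptr mem
	v1 = Select0 <int64> v3
	v2 = Select1 <int64> v3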

func pairStores(f *Func) {
	last := f.Cache.allocBoolSlice(f.NumValues())
	defer f.Cache.freeBoolSlice(last)

	// prevStore returns the previous store in the
	// same block, or nil if there are none.
	prevStore := func(v *Value) *Value {
		if v.Op == OpInitMem || v.Op == OpPhi {
			return nil
		}
		m := v.MemoryArg()
		if m.Block != v.Block {
			return nil
		}
		return m
	}

	for _, b := range f.Blocks {
		// Find last store in block, so we can
		// walk the stores last to first.
		// Last to first helps ensure that the rewrites we
		// perform do not get in the way of subsequent rewrites.
		for _, v := range b.Values {
			if v.Type.IsMemory() {
				last[v.ID] = true
			}
		}
		for _, v := range b.Values {
			if v.Type.IsMemory() {
				if m := prevStore(v); m != nil {
					last[m.ID] = false
				}
			}
		}
		var lastMem *Value
		for _, v := range b.Values {
			if last[v.ID] {
				lastMem = v
				break
			}
		}

		// Check all stores, from last to first.
	memCheck:
		for v := lastMem; v != nil; v = prevStore(v) {
			info := pairableStores[v.Op]
			if info.width == 0 {
				continue // Not pairable.
			}
			if !offsetOk(v.Aux, v.AuxInt, info.width) {
				continue // Not advisable to pair.
			}
			ptr := v.Args[0]
			val := v.Args[1]
			mem := v.Args[2]
			off := v.AuxInt
			aux := v.Aux

			// Look for earlier store we can combine with.
			lowerOk := true
			higherOk := true
			count := 10 // max lookback distance
			for w := prevStore(v); w != nil; w = prevStore(w) {
				if w.Uses != 1 {
					// We can't combine stores if the earlier
					// store has any use besides the next one
					// in the store chain.
					// (Unless we could check the aliasing of
					// all those other uses.)
					continue memCheck
				}
				if w.Op == v.Op &&
					w.Args[0] == ptr &&
					w.Aux == aux &&
					(lowerOk && w.AuxInt == off-info.width || higherOk && w.AuxInt == off+info.width) {
					// This op is mergeable with v.

					// Commit point.

					// ptr val1 val2 mem
					args := []*Value{ptr, val, w.Args[1], mem}
					if w.AuxInt == off-info.width {
						args[1], args[2] = args[2], args[1]
						off -= info.width
					}
					v.reset(info.pair)
					v.AddArgs(args...)
					v.Aux = aux
					v.AuxInt = off
					v.Pos = w.Pos // take position of earlier of the two stores (TODO: not really working?)

					// Make w just a memory copy.
					wmem := w.MemoryArg()
					w.reset(OpCopy)
					w.SetArgs1(wmem)
					continue memCheck
				}
				if count--; count == 0 {
					// Only look back so far.
					// This keeps us in O(n) territory, and it
					// also prevents us from keeping values
					// in registers for too long (and thus
					// needing to spill them).
					continue memCheck
				}
				// We're now looking at a store w which is currently
				// between the store v that we're intending to merge into,
				// and the store we'll eventually find to merge with it.
				// Make sure this store doesn't alias with the one
				// we'll be moving.
				var width int64
				switch w.Op {
				case OpARM64MOVDstore, OpARM64MOVDstorezero, OpARM64FMOVDstore:
					width = 8
				case OpARM64MOVWstore, OpARM64MOVWstorezero, OpARM64FMOVSstore:
					width = 4
				case OpARM64MOVHstore, OpARM64MOVHstorezero:
					width = 2
				case OpARM64MOVBstore, OpARM64MOVBstorezero:
					width = 1
				case OpCopy:
					continue // this was a store we merged earlier
				default:
					// Can't reorder with any other memory operations.
					// (atomics, calls, ...)
					continue memCheck
				}

				// We only allow reordering with respect to other
				// writes to the same pointer and aux, so we can
				// compute the exact aliasing relationship.
				if w.Args[0] != ptr || w.Aux != aux {
					continue memCheck
				}
				if overlap(w.AuxInt, width, off-info.width, info.width) {
					// Aliases with slot before v's location.
					lowerOk = false
				}
				if overlap(w.AuxInt, width, off+info.width, info.width) {
					// Aliases with slot after v's location.
					higherOk = false
				}
				if !higherOk && !lowerOk {
					continue memCheck
				}
			}
		}
	}
}
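
pairStores depends on an overlap helper that this diff does not add; it already exists elsewhere in package ssa. A minimal sketch of the semantics assumed here, namely whether the half-open byte ranges [offset1, offset1+size1) and [offset2, offset2+size2) intersect (the in-tree definition may differ in detail):

	// overlap reports whether the byte ranges [offset1, offset1+size1)
	// and [offset2, offset2+size2) intersect. Sketch of the helper
	// assumed by pairStores; not part of this diff.
	func overlap(offset1, size1, offset2, size2 int64) bool {
		if offset1 >= offset2 && offset2+size2 > offset1 {
			return true
		}
		if offset2 >= offset1 && offset1+size1 > offset2 {
			return true
		}
		return false
	}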
@@ -899,9 +899,11 @@ func store32le(p *struct{ a, b uint32 }, x uint64) {
 	p.b = uint32(x >> 32)
 }
 func store32be(p *struct{ a, b uint32 }, x uint64) {
+	// arm64:"STPW"
 	// ppc64:"MOVD",-"MOVW",-"SRD"
 	// s390x:"MOVD",-"MOVW",-"SRD"
 	p.a = uint32(x >> 32)
+	// arm64:-"STPW"
 	// ppc64:-"MOVW",-"SRD"
 	// s390x:-"MOVW",-"SRD"
 	p.b = uint32(x)
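
A note on the check syntax in this and the next hunk (our summary of the test/codegen convention, not text from the patch): a comment such as // arm64:"STPW" requires the quoted regexp to match the assembly generated for the enclosing function, while a leading minus, as in // arm64:-"STPW", requires that it not match.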
@@ -970,3 +972,95 @@ func issue70300Reverse(v uint64) (b [8]byte) {
 	b[0] = byte(v)
 	return b
 }
+
+// --------------------------------- //
+//    Arm64 double-register loads    //
+// --------------------------------- //
+
+func dwloadI64(p *struct{ a, b int64 }) int64 {
+	// arm64:"LDP\t"
+	return p.a + p.b
+}
+func dwloadI32(p *struct{ a, b int32 }) int32 {
+	// arm64:"LDPW\t"
+	return p.a + p.b
+}
+func dwloadF64(p *struct{ a, b float64 }) float64 {
+	// arm64:"FLDPD\t"
+	return p.a + p.b
+}
+func dwloadF32(p *struct{ a, b float32 }) float32 {
+	// arm64:"FLDPS\t"
+	return p.a + p.b
+}
+
+func dwloadBig(p *struct{ a, b, c, d, e, f int64 }) int64 {
+	// arm64:"LDP\t\\(", "LDP\t16", "LDP\t32"
+	return p.c + p.f + p.a + p.e + p.d + p.b
+}
+
+func dwloadArg(a [2]int64) int64 {
+	// arm64:"LDP\t"
+	return a[0] + a[1]
+}
+
+// ---------------------------------- //
+//    Arm64 double-register stores    //
+// ---------------------------------- //
+
+func dwstoreI64(p *struct{ a, b int64 }, x, y int64) {
+	// arm64:"STP\t"
+	p.a = x
+	p.b = y
+}
+func dwstoreI32(p *struct{ a, b int32 }, x, y int32) {
+	// arm64:"STPW\t"
+	p.a = x
+	p.b = y
+}
+func dwstoreF64(p *struct{ a, b float64 }, x, y float64) {
+	// arm64:"FSTPD\t"
+	p.a = x
+	p.b = y
+}
+func dwstoreF32(p *struct{ a, b float32 }, x, y float32) {
+	// arm64:"FSTPS\t"
+	p.a = x
+	p.b = y
+}
+
+func dwstoreBig(p *struct{ a, b, c, d, e, f int64 }, a, b, c, d, e, f int64) {
+	// This is not perfect. We merge b+a, then d+e, then c and f have no pair.
+	p.c = c
+	p.f = f
+	// arm64:`STP\s\(R[0-9]+, R[0-9]+\), \(R[0-9]+\)`
+	p.a = a
+	// arm64:`STP\s\(R[0-9]+, R[0-9]+\), 24\(R[0-9]+\)`
+	p.e = e
+	p.d = d
+	p.b = b
+}
+
+func dwstoreRet() [2]int {
+	// arm64:"STP\t"
+	return [2]int{5, 6}
+}
+
+func dwstoreLocal(i int) int64 {
+	var a [2]int64
+	a[0] = 5
+	// arm64:"STP\t"
+	a[1] = 6
+	return a[i]
+}
+
+func dwstoreOrder(p *struct {
+	a, b       int64
+	c, d, e, f bool
+}, a, b int64) {
+	// arm64:"STP\t"
+	p.a = a
+	p.c = true
+	p.e = true
+	p.b = b
+}
@@ -9,14 +9,20 @@
package main

var (
	e              any
	ts             uint16
	ga, gb, gc, gd int
)

func moveValuesWithMemoryArg(len int) {
	for n := 0; n < len; n++ {
		// Load of e.data is lowered as a MOVDload op, which has a memory
		// argument. It's moved near where it's used.
		_ = e != ts // ERROR "MOVDload is moved$" "MOVDaddr is moved$"
		// Loads of b and d can be delayed until inside the outer "if".
		a := ga
		b := gb // ERROR "MOVDload is moved$"
		c := gc
		d := gd // ERROR "MOVDload is moved$"
		if a == c {
			if b == d {
				return
			}
		}
	}
}