cmd/compile: pair loads and stores on arm64

Look for possible paired load/store operations on arm64.
I don't expect this to be a lot faster, but it will save
binary space and, indirectly through better icache utilization,
at least a bit of time.

Change-Id: I4dd73b0e6329c4659b7453998f9b75320fcf380b
Reviewed-on: https://go-review.googlesource.com/c/go/+/629256
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
This commit is contained in:
khr@golang.org 2024-11-17 12:07:34 -08:00 committed by Keith Randall
parent 0b88a87879
commit 20d7c57422
4 changed files with 463 additions and 5 deletions

View File

@ -488,6 +488,7 @@ var passes = [...]pass{
{name: "lower", fn: lower, required: true},
{name: "addressing modes", fn: addressingModes, required: false},
{name: "late lower", fn: lateLower, required: true},
{name: "pair", fn: pair},
{name: "lowered deadcode for cse", fn: deadcode}, // deadcode immediately before CSE avoids CSE making dead values live again
{name: "lowered cse", fn: cse},
{name: "elim unread autos", fn: elimUnreadAutos},

View File

@ -0,0 +1,357 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ssa
import (
	"cmd/compile/internal/ir"
	"cmd/compile/internal/types"
	"cmp"
	"slices"
)
// The pair pass finds memory operations that can be paired up
// into single 2-register memory instructions.
func pair(f *Func) {
	// This pass is fairly arch-specific; only arm64 is handled for now.
	if f.Config.arch != "arm64" {
		return
	}
	pairLoads(f)
	pairStores(f)
}
// pairableLoadInfo describes how one single-register load op can be
// combined with an adjacent twin into a 2-register load.
type pairableLoadInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair  Op    // the 2-register op that loads both elements at once
}
// pairableLoads maps each pairable single-register load op to its
// element width and the 2-register op that replaces a pair of them.
// All pairableLoad ops must take 2 arguments, a pointer and a memory.
// They must also take an offset in Aux/AuxInt.
var pairableLoads = map[Op]pairableLoadInfo{
	OpARM64MOVDload:  {8, OpARM64LDP},
	OpARM64MOVWload:  {4, OpARM64LDPW},
	OpARM64FMOVDload: {8, OpARM64FLDPD},
	OpARM64FMOVSload: {4, OpARM64FLDPS},
}
// pairableStoreInfo describes how one single-register store op can be
// combined with an adjacent twin into a 2-register store.
type pairableStoreInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair  Op    // the 2-register op that stores both elements at once
}
// pairableStores maps each pairable single-register store op to its
// element width and the 2-register op that replaces a pair of them.
// All pairableStore keys must take 3 arguments, a pointer, a value, and a memory.
// All pairableStore values must take 4 arguments, a pointer, 2 values, and a memory.
// They must also take an offset in Aux/AuxInt.
var pairableStores = map[Op]pairableStoreInfo{
	OpARM64MOVDstore:  {8, OpARM64STP},
	OpARM64MOVWstore:  {4, OpARM64STPW},
	OpARM64FMOVDstore: {8, OpARM64FSTPD},
	OpARM64FMOVSstore: {4, OpARM64FSTPS},
	// TODO: storezero variants.
}
// offsetOk returns true if a pair instruction should be used
// for the offset Aux+off, when the data width (of the
// unpaired instructions) is width.
// This function is best-effort. The compiled function must
// still work if offsetOk always returns true.
// TODO: this is currently arm64-specific.
func offsetOk(aux Aux, off, width int64) bool {
	if true {
		// Seems to generate slightly smaller code if we just
		// always allow this rewrite.
		//
		// Without pairing, we have 2 load instructions, like:
		//   LDR 88(R0), R1
		//   LDR 96(R0), R2
		// with pairing we have, best case:
		//   LDP 88(R0), R1, R2
		// but maybe we need an adjuster if out of range or unaligned:
		//   ADD R0, $88, R27
		//   LDP (R27), R1, R2
		// Even with the adjuster, it is at least no worse.
		//
		// A similar situation occurs when accessing globals.
		// Two loads from globals requires 4 instructions,
		// two ADRP and two LDR. With pairing, we need
		// ADRP+ADD+LDP, three instructions.
		//
		// With pairing, it looks like the critical path might
		// be a little bit longer. But it should never be more
		// instructions.
		// TODO: see if that longer critical path causes any
		// regressions.
		return true
	}
	// NOTE: everything below is currently unreachable; it is kept
	// around for experimenting with stricter offset limits.
	if aux != nil {
		if _, ok := aux.(*ir.Name); !ok {
			// Offset is probably too big (globals).
			return false
		}
		// We let *ir.Names pass here, as
		// they are probably small offsets from SP.
		// There's no guarantee that we're in range
		// in that case though (we don't know the
		// stack frame size yet), so the assembler
		// might need to issue fixup instructions.
		// Assume some small frame size.
		if off >= 0 {
			off += 120
		}
		// TODO: figure out how often this helps vs. hurts.
	}
	// These ranges correspond to a signed 7-bit immediate scaled by
	// the element width (width*[-64,63]), which is what the arm64
	// LDP/STP immediate encodings allow.
	switch width {
	case 4:
		if off >= -256 && off <= 252 && off%4 == 0 {
			return true
		}
	case 8:
		if off >= -512 && off <= 504 && off%8 == 0 {
			return true
		}
	}
	return false
}
// pairLoads combines pairs of adjacent same-width loads in f into
// single 2-register load instructions (LDP and friends).
// Loads are paired only within a single block, when they share the
// same op, pointer, memory argument, and Aux, and their offsets
// cover adjacent memory.
func pairLoads(f *Func) {
	var loads []*Value

	// Registry of aux values for sorting.
	// auxID assigns a small dense integer to each distinct Aux seen in
	// the current block, giving the sort below a comparable key for them.
	auxIDs := map[Aux]int{}
	auxID := func(aux Aux) int {
		id, ok := auxIDs[aux]
		if !ok {
			id = len(auxIDs)
			auxIDs[aux] = id
		}
		return id
	}

	for _, b := range f.Blocks {
		// Find candidate loads.
		loads = loads[:0]
		clear(auxIDs)
		for _, v := range b.Values {
			info := pairableLoads[v.Op]
			if info.width == 0 {
				continue // not pairable
			}
			if !offsetOk(v.Aux, v.AuxInt, info.width) {
				continue // not advisable
			}
			loads = append(loads, v)
		}
		if len(loads) < 2 {
			continue
		}

		// Sort to put pairable loads together.
		slices.SortFunc(loads, func(x, y *Value) int {
			// First sort by op, ptr, and memory arg.
			if x.Op != y.Op {
				return cmp.Compare(x.Op, y.Op)
			}
			if x.Args[0].ID != y.Args[0].ID {
				return cmp.Compare(x.Args[0].ID, y.Args[0].ID)
			}
			if x.Args[1].ID != y.Args[1].ID {
				return cmp.Compare(x.Args[1].ID, y.Args[1].ID)
			}
			// Then sort by aux. (nil first, then by aux ID)
			if x.Aux != nil {
				if y.Aux == nil {
					return 1
				}
				a, b := auxID(x.Aux), auxID(y.Aux)
				if a != b {
					return a - b
				}
			} else if y.Aux != nil {
				return -1
			}
			// Then sort by offset, low to high.
			// Use cmp.Compare rather than int(x.AuxInt-y.AuxInt):
			// the subtraction can overflow int on 32-bit compiler
			// hosts (AuxInt is a full int64), which would make the
			// comparator inconsistent.
			return cmp.Compare(x.AuxInt, y.AuxInt)
		})

		// Look for pairable loads. After sorting, candidates for
		// pairing are adjacent, so only consecutive entries need
		// to be examined.
		for i := 0; i < len(loads)-1; i++ {
			x := loads[i]
			y := loads[i+1]
			if x.Op != y.Op || x.Args[0] != y.Args[0] || x.Args[1] != y.Args[1] {
				continue
			}
			if x.Aux != y.Aux {
				continue
			}
			// The two loads must cover adjacent memory, x first.
			if x.AuxInt+pairableLoads[x.Op].width != y.AuxInt {
				continue
			}
			// Commit point.

			// Make the 2-register load.
			load := b.NewValue2IA(x.Pos, pairableLoads[x.Op].pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])

			// Modify x to be (Select0 load). Similar for y.
			x.reset(OpSelect0)
			x.SetArgs1(load)
			y.reset(OpSelect1)
			y.SetArgs1(load)
			i++ // Skip y next time around the loop.
		}
	}
}
// pairStores combines pairs of adjacent same-width stores in f into
// single 2-register store instructions (STP and friends).
// It walks each block's store chain from last to first, and for each
// pairable store looks back a bounded distance for a partner store to
// the adjacent memory slot.
func pairStores(f *Func) {
	// last[v.ID] reports whether v is the last store in its block.
	last := f.Cache.allocBoolSlice(f.NumValues())
	defer f.Cache.freeBoolSlice(last)

	// prevStore returns the previous store in the
	// same block, or nil if there are none.
	prevStore := func(v *Value) *Value {
		if v.Op == OpInitMem || v.Op == OpPhi {
			return nil
		}
		m := v.MemoryArg()
		if m.Block != v.Block {
			return nil
		}
		return m
	}

	for _, b := range f.Blocks {
		// Find last store in block, so we can
		// walk the stores last to first.
		// Last to first helps ensure that the rewrites we
		// perform do not get in the way of subsequent rewrites.
		// First mark every memory-producing value...
		for _, v := range b.Values {
			if v.Type.IsMemory() {
				last[v.ID] = true
			}
		}
		// ...then unmark every value that has a successor in the
		// store chain. The one still marked is the last store.
		for _, v := range b.Values {
			if v.Type.IsMemory() {
				if m := prevStore(v); m != nil {
					last[m.ID] = false
				}
			}
		}
		var lastMem *Value
		for _, v := range b.Values {
			if last[v.ID] {
				lastMem = v
				break
			}
		}

		// Check all stores, from last to first.
	memCheck:
		for v := lastMem; v != nil; v = prevStore(v) {
			info := pairableStores[v.Op]
			if info.width == 0 {
				continue // Not pairable.
			}
			if !offsetOk(v.Aux, v.AuxInt, info.width) {
				continue // Not advisable to pair.
			}
			ptr := v.Args[0]
			val := v.Args[1]
			mem := v.Args[2]
			off := v.AuxInt
			aux := v.Aux

			// Look for earlier store we can combine with.
			// lowerOk/higherOk track whether pairing with the slot
			// just below/above v's location is still alias-safe.
			lowerOk := true
			higherOk := true
			count := 10 // max lookback distance
			for w := prevStore(v); w != nil; w = prevStore(w) {
				if w.Uses != 1 {
					// We can't combine stores if the earlier
					// store has any use besides the next one
					// in the store chain.
					// (Unless we could check the aliasing of
					// all those other uses.)
					continue memCheck
				}
				if w.Op == v.Op &&
					w.Args[0] == ptr &&
					w.Aux == aux &&
					(lowerOk && w.AuxInt == off-info.width || higherOk && w.AuxInt == off+info.width) {
					// This op is mergeable with v.

					// Commit point.

					// ptr val1 val2 mem
					args := []*Value{ptr, val, w.Args[1], mem}
					if w.AuxInt == off-info.width {
						// w writes the lower address: its value
						// goes first, and the pair starts there.
						args[1], args[2] = args[2], args[1]
						off -= info.width
					}
					v.reset(info.pair)
					v.AddArgs(args...)
					v.Aux = aux
					v.AuxInt = off
					v.Pos = w.Pos // take position of earlier of the two stores (TODO: not really working?)

					// Make w just a memory copy.
					wmem := w.MemoryArg()
					w.reset(OpCopy)
					w.SetArgs1(wmem)
					continue memCheck
				}
				if count--; count == 0 {
					// Only look back so far.
					// This keeps us in O(n) territory, and it
					// also prevents us from keeping values
					// in registers for too long (and thus
					// needing to spill them).
					continue memCheck
				}

				// We're now looking at a store w which is currently
				// between the store v that we're intending to merge into,
				// and the store we'll eventually find to merge with it.
				// Make sure this store doesn't alias with the one
				// we'll be moving.
				var width int64
				switch w.Op {
				case OpARM64MOVDstore, OpARM64MOVDstorezero, OpARM64FMOVDstore:
					width = 8
				case OpARM64MOVWstore, OpARM64MOVWstorezero, OpARM64FMOVSstore:
					width = 4
				case OpARM64MOVHstore, OpARM64MOVHstorezero:
					width = 2
				case OpARM64MOVBstore, OpARM64MOVBstorezero:
					width = 1
				case OpCopy:
					continue // this was a store we merged earlier
				default:
					// Can't reorder with any other memory operations.
					// (atomics, calls, ...)
					continue memCheck
				}

				// We only allow reordering with respect to other
				// writes to the same pointer and aux, so we can
				// compute the exact aliasing relationship.
				if w.Args[0] != ptr || w.Aux != aux {
					continue memCheck
				}
				if overlap(w.AuxInt, width, off-info.width, info.width) {
					// Aliases with slot before v's location.
					lowerOk = false
				}
				if overlap(w.AuxInt, width, off+info.width, info.width) {
					// Aliases with slot after v's location.
					higherOk = false
				}
				if !higherOk && !lowerOk {
					continue memCheck
				}
			}
		}
	}
}

View File

@ -899,9 +899,11 @@ func store32le(p *struct{ a, b uint32 }, x uint64) {
p.b = uint32(x >> 32)
}
func store32be(p *struct{ a, b uint32 }, x uint64) {
// arm64:"STPW"
// ppc64:"MOVD",-"MOVW",-"SRD"
// s390x:"MOVD",-"MOVW",-"SRD"
p.a = uint32(x >> 32)
// arm64:-"STPW"
// ppc64:-"MOVW",-"SRD"
// s390x:-"MOVW",-"SRD"
p.b = uint32(x)
@ -970,3 +972,95 @@ func issue70300Reverse(v uint64) (b [8]byte) {
b[0] = byte(v)
return b
}
// --------------------------------- //
// Arm64 double-register loads //
// --------------------------------- //
// dwloadI64 checks that two adjacent int64 loads merge into one LDP.
func dwloadI64(p *struct{ a, b int64 }) int64 {
	// arm64:"LDP\t"
	return p.a + p.b
}
// dwloadI32 checks that two adjacent int32 loads merge into one LDPW.
func dwloadI32(p *struct{ a, b int32 }) int32 {
	// arm64:"LDPW\t"
	return p.a + p.b
}
// dwloadF64 checks that two adjacent float64 loads merge into one FLDPD.
func dwloadF64(p *struct{ a, b float64 }) float64 {
	// arm64:"FLDPD\t"
	return p.a + p.b
}
// dwloadF32 checks that two adjacent float32 loads merge into one FLDPS.
func dwloadF32(p *struct{ a, b float32 }) float32 {
	// arm64:"FLDPS\t"
	return p.a + p.b
}
// dwloadBig checks that six loads, used out of order, still pair up
// into three LDPs at offsets 0, 16, and 32.
func dwloadBig(p *struct{ a, b, c, d, e, f int64 }) int64 {
	// arm64:"LDP\t\\(", "LDP\t16", "LDP\t32"
	return p.c + p.f + p.a + p.e + p.d + p.b
}
// dwloadArg checks that loads of a stack-resident array argument pair
// into one LDP.
func dwloadArg(a [2]int64) int64 {
	// arm64:"LDP\t"
	return a[0] + a[1]
}
// ---------------------------------- //
// Arm64 double-register stores //
// ---------------------------------- //
// dwstoreI64 checks that two adjacent int64 stores merge into one STP.
func dwstoreI64(p *struct{ a, b int64 }, x, y int64) {
	// arm64:"STP\t"
	p.a = x
	p.b = y
}
// dwstoreI32 checks that two adjacent int32 stores merge into one STPW.
func dwstoreI32(p *struct{ a, b int32 }, x, y int32) {
	// arm64:"STPW\t"
	p.a = x
	p.b = y
}
// dwstoreF64 checks that two adjacent float64 stores merge into one FSTPD.
func dwstoreF64(p *struct{ a, b float64 }, x, y float64) {
	// arm64:"FSTPD\t"
	p.a = x
	p.b = y
}
// dwstoreF32 checks that two adjacent float32 stores merge into one FSTPS.
func dwstoreF32(p *struct{ a, b float32 }, x, y float32) {
	// arm64:"FSTPS\t"
	p.a = x
	p.b = y
}
// dwstoreBig checks pairing of out-of-order stores to a larger struct.
func dwstoreBig(p *struct{ a, b, c, d, e, f int64 }, a, b, c, d, e, f int64) {
	// This is not perfect. We merge b+a, then d+e, then c and f have no pair.
	p.c = c
	p.f = f
	// arm64:`STP\s\(R[0-9]+, R[0-9]+\), \(R[0-9]+\)`
	p.a = a
	// arm64:`STP\s\(R[0-9]+, R[0-9]+\), 24\(R[0-9]+\)`
	p.e = e
	p.d = d
	p.b = b
}
// dwstoreRet checks that stores initializing a returned array pair
// into one STP.
func dwstoreRet() [2]int {
	// arm64:"STP\t"
	return [2]int{5, 6}
}
// dwstoreLocal checks that stores to a stack-allocated local array
// pair into one STP.
func dwstoreLocal(i int) int64 {
	var a [2]int64
	a[0] = 5
	// arm64:"STP\t"
	a[1] = 6
	return a[i]
}
// dwstoreOrder checks that the two int64 stores pair up even with
// unrelated (non-aliasing) bool stores between them.
func dwstoreOrder(p *struct {
	a, b int64
	c, d, e, f bool
}, a, b int64) {
	// arm64:"STP\t"
	p.a = a
	p.c = true
	p.e = true
	p.b = b
}

View File

@ -9,14 +9,20 @@
package main
var (
e any
ts uint16
ga, gb, gc, gd int
)
// moveValuesWithMemoryArg exercises moving ops that carry a memory
// argument closer to their uses; the ERROR directives are the
// assertions checked by the test harness.
func moveValuesWithMemoryArg(len int) {
	for n := 0; n < len; n++ {
		// Load of e.data is lowered as a MOVDload op, which has a memory
		// argument. It's moved near where it's used.
		_ = e != ts // ERROR "MOVDload is moved$" "MOVDaddr is moved$"

		// Loads of b and d can be delayed until inside the outer "if".
		a := ga
		b := gb // ERROR "MOVDload is moved$"
		c := gc
		d := gd // ERROR "MOVDload is moved$"
		if a == c {
			if b == d {
				return
			}
		}
	}
}