diff --git a/src/cmd/compile/internal/ssa/compile.go b/src/cmd/compile/internal/ssa/compile.go
index 3f46599a3e..634a6f6864 100644
--- a/src/cmd/compile/internal/ssa/compile.go
+++ b/src/cmd/compile/internal/ssa/compile.go
@@ -488,6 +488,7 @@ var passes = [...]pass{
 	{name: "lower", fn: lower, required: true},
 	{name: "addressing modes", fn: addressingModes, required: false},
 	{name: "late lower", fn: lateLower, required: true},
+	{name: "pair", fn: pair},
 	{name: "lowered deadcode for cse", fn: deadcode}, // deadcode immediately before CSE avoids CSE making dead values live again
 	{name: "lowered cse", fn: cse},
 	{name: "elim unread autos", fn: elimUnreadAutos},
diff --git a/src/cmd/compile/internal/ssa/pair.go b/src/cmd/compile/internal/ssa/pair.go
new file mode 100644
index 0000000000..5af9b0cb1b
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/pair.go
@@ -0,0 +1,357 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ssa
+
+import (
+	"cmd/compile/internal/ir"
+	"cmd/compile/internal/types"
+	"slices"
+)
+
+// The pair pass finds memory operations that can be paired up
+// into single 2-register memory instructions.
+func pair(f *Func) {
+	// Only arm64 for now. This pass is fairly arch-specific.
+	switch f.Config.arch {
+	case "arm64":
+	default:
+		return
+	}
+	pairLoads(f)
+	pairStores(f)
+}
+
+type pairableLoadInfo struct {
+	width int64 // width of one element in the pair, in bytes
+	pair  Op
+}
+
+// All pairableLoad ops must take 2 arguments, a pointer and a memory.
+// They must also take an offset in Aux/AuxInt.
+var pairableLoads = map[Op]pairableLoadInfo{
+	OpARM64MOVDload:  {8, OpARM64LDP},
+	OpARM64MOVWload:  {4, OpARM64LDPW},
+	OpARM64FMOVDload: {8, OpARM64FLDPD},
+	OpARM64FMOVSload: {4, OpARM64FLDPS},
+}
+
+type pairableStoreInfo struct {
+	width int64 // width of one element in the pair, in bytes
+	pair  Op
+}
+
+// All pairableStore keys must take 3 arguments, a pointer, a value, and a memory.
+// All pairableStore values must take 4 arguments, a pointer, 2 values, and a memory.
+// They must also take an offset in Aux/AuxInt.
+var pairableStores = map[Op]pairableStoreInfo{
+	OpARM64MOVDstore:  {8, OpARM64STP},
+	OpARM64MOVWstore:  {4, OpARM64STPW},
+	OpARM64FMOVDstore: {8, OpARM64FSTPD},
+	OpARM64FMOVSstore: {4, OpARM64FSTPS},
+	// TODO: storezero variants.
+}
+
+// offsetOk returns true if a pair instruction should be used
+// for the offset Aux+off, when the data width (of the
+// unpaired instructions) is width.
+// This function is best-effort. The compiled function must
+// still work if offsetOk always returns true.
+// TODO: this is currently arm64-specific.
+func offsetOk(aux Aux, off, width int64) bool {
+	if true {
+		// Seems to generate slightly smaller code if we just
+		// always allow this rewrite.
+		//
+		// Without pairing, we have 2 load instructions, like:
+		//   LDR 88(R0), R1
+		//   LDR 96(R0), R2
+		// with pairing we have, best case:
+		//   LDP 88(R0), R1, R2
+		// but maybe we need an adjuster if out of range or unaligned:
+		//   ADD R0, $88, R27
+		//   LDP (R27), R1, R2
+		// Even with the adjuster, it is at least no worse.
+		//
+		// A similar situation occurs when accessing globals.
+		// Two loads from globals require 4 instructions,
+		// two ADRP and two LDR. With pairing, we need
+		// ADRP+ADD+LDP, three instructions.
+		//
+		// With pairing, it looks like the critical path might
+		// be a little bit longer. But it should never be more
+		// instructions.
+		// TODO: see if that longer critical path causes any
+		// regressions.
+		return true
+	}
+	if aux != nil {
+		if _, ok := aux.(*ir.Name); !ok {
+			// Offset is probably too big (globals).
+			return false
+		}
+		// We let *ir.Names pass here, as
+		// they are probably small offsets from SP.
+		// There's no guarantee that we're in range
+		// in that case though (we don't know the
+		// stack frame size yet), so the assembler
+		// might need to issue fixup instructions.
+		// Assume some small frame size.
+		if off >= 0 {
+			off += 120
+		}
+		// TODO: figure out how often this helps vs. hurts.
+	}
+	switch width {
+	case 4:
+		if off >= -256 && off <= 252 && off%4 == 0 {
+			return true
+		}
+	case 8:
+		if off >= -512 && off <= 504 && off%8 == 0 {
+			return true
+		}
+	}
+	return false
+}
+
+func pairLoads(f *Func) {
+	var loads []*Value
+
+	// Registry of aux values for sorting.
+	auxIDs := map[Aux]int{}
+	auxID := func(aux Aux) int {
+		id, ok := auxIDs[aux]
+		if !ok {
+			id = len(auxIDs)
+			auxIDs[aux] = id
+		}
+		return id
+	}
+
+	for _, b := range f.Blocks {
+		// Find loads.
+		loads = loads[:0]
+		clear(auxIDs)
+		for _, v := range b.Values {
+			info := pairableLoads[v.Op]
+			if info.width == 0 {
+				continue // not pairable
+			}
+			if !offsetOk(v.Aux, v.AuxInt, info.width) {
+				continue // not advisable
+			}
+			loads = append(loads, v)
+		}
+		if len(loads) < 2 {
+			continue
+		}
+
+		// Sort to put pairable loads together.
+		slices.SortFunc(loads, func(x, y *Value) int {
+			// First sort by op, ptr, and memory arg.
+			if x.Op != y.Op {
+				return int(x.Op - y.Op)
+			}
+			if x.Args[0].ID != y.Args[0].ID {
+				return int(x.Args[0].ID - y.Args[0].ID)
+			}
+			if x.Args[1].ID != y.Args[1].ID {
+				return int(x.Args[1].ID - y.Args[1].ID)
+			}
+			// Then sort by aux. (nil first, then by aux ID)
+			if x.Aux != nil {
+				if y.Aux == nil {
+					return 1
+				}
+				a, b := auxID(x.Aux), auxID(y.Aux)
+				if a != b {
+					return a - b
+				}
+			} else if y.Aux != nil {
+				return -1
+			}
+			// Then sort by offset, low to high.
+			return int(x.AuxInt - y.AuxInt)
+		})
+
+		// Look for pairable loads.
+		for i := 0; i < len(loads)-1; i++ {
+			x := loads[i]
+			y := loads[i+1]
+			if x.Op != y.Op || x.Args[0] != y.Args[0] || x.Args[1] != y.Args[1] {
+				continue
+			}
+			if x.Aux != y.Aux {
+				continue
+			}
+			if x.AuxInt+pairableLoads[x.Op].width != y.AuxInt {
+				continue
+			}
+
+			// Commit point.
+
+			// Make the 2-register load.
+			load := b.NewValue2IA(x.Pos, pairableLoads[x.Op].pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])
+
+			// Modify x to be (Select0 load). Similar for y.
+			x.reset(OpSelect0)
+			x.SetArgs1(load)
+			y.reset(OpSelect1)
+			y.SetArgs1(load)
+
+			i++ // Skip y next time around the loop.
+		}
+	}
+}
+
+func pairStores(f *Func) {
+	last := f.Cache.allocBoolSlice(f.NumValues())
+	defer f.Cache.freeBoolSlice(last)
+
+	// prevStore returns the previous store in the
+	// same block, or nil if there are none.
+	prevStore := func(v *Value) *Value {
+		if v.Op == OpInitMem || v.Op == OpPhi {
+			return nil
+		}
+		m := v.MemoryArg()
+		if m.Block != v.Block {
+			return nil
+		}
+		return m
+	}
+
+	for _, b := range f.Blocks {
+		// Find last store in block, so we can
+		// walk the stores last to first.
+		// Last to first helps ensure that the rewrites we
+		// perform do not get in the way of subsequent rewrites.
+		for _, v := range b.Values {
+			if v.Type.IsMemory() {
+				last[v.ID] = true
+			}
+		}
+		for _, v := range b.Values {
+			if v.Type.IsMemory() {
+				if m := prevStore(v); m != nil {
+					last[m.ID] = false
+				}
+			}
+		}
+		var lastMem *Value
+		for _, v := range b.Values {
+			if last[v.ID] {
+				lastMem = v
+				break
+			}
+		}
+
+		// Check all stores, from last to first.
+	memCheck:
+		for v := lastMem; v != nil; v = prevStore(v) {
+			info := pairableStores[v.Op]
+			if info.width == 0 {
+				continue // Not pairable.
+			}
+			if !offsetOk(v.Aux, v.AuxInt, info.width) {
+				continue // Not advisable to pair.
+			}
+			ptr := v.Args[0]
+			val := v.Args[1]
+			mem := v.Args[2]
+			off := v.AuxInt
+			aux := v.Aux
+
+			// Look for earlier store we can combine with.
+			lowerOk := true
+			higherOk := true
+			count := 10 // max lookback distance
+			for w := prevStore(v); w != nil; w = prevStore(w) {
+				if w.Uses != 1 {
+					// We can't combine stores if the earlier
+					// store has any use besides the next one
+					// in the store chain.
+					// (Unless we could check the aliasing of
+					// all those other uses.)
+					continue memCheck
+				}
+				if w.Op == v.Op &&
+					w.Args[0] == ptr &&
+					w.Aux == aux &&
+					(lowerOk && w.AuxInt == off-info.width || higherOk && w.AuxInt == off+info.width) {
+					// This op is mergeable with v.
+
+					// Commit point.
+
+					// ptr val1 val2 mem
+					args := []*Value{ptr, val, w.Args[1], mem}
+					if w.AuxInt == off-info.width {
+						args[1], args[2] = args[2], args[1]
+						off -= info.width
+					}
+					v.reset(info.pair)
+					v.AddArgs(args...)
+					v.Aux = aux
+					v.AuxInt = off
+					v.Pos = w.Pos // take position of earlier of the two stores (TODO: not really working?)
+
+					// Make w just a memory copy.
+					wmem := w.MemoryArg()
+					w.reset(OpCopy)
+					w.SetArgs1(wmem)
+					continue memCheck
+				}
+				if count--; count == 0 {
+					// Only look back so far.
+					// This keeps us in O(n) territory, and it
+					// also prevents us from keeping values
+					// in registers for too long (and thus
+					// needing to spill them).
+					continue memCheck
+				}
+				// We're now looking at a store w which is currently
+				// between the store v that we're intending to merge into,
+				// and the store we'll eventually find to merge with it.
+				// Make sure this store doesn't alias with the one
+				// we'll be moving.
+				var width int64
+				switch w.Op {
+				case OpARM64MOVDstore, OpARM64MOVDstorezero, OpARM64FMOVDstore:
+					width = 8
+				case OpARM64MOVWstore, OpARM64MOVWstorezero, OpARM64FMOVSstore:
+					width = 4
+				case OpARM64MOVHstore, OpARM64MOVHstorezero:
+					width = 2
+				case OpARM64MOVBstore, OpARM64MOVBstorezero:
+					width = 1
+				case OpCopy:
+					continue // this was a store we merged earlier
+				default:
+					// Can't reorder with any other memory operations.
+					// (atomics, calls, ...)
+					continue memCheck
+				}
+
+				// We only allow reordering with respect to other
+				// writes to the same pointer and aux, so we can
+				// compute the exact aliasing relationship.
+				if w.Args[0] != ptr || w.Aux != aux {
+					continue memCheck
+				}
+				if overlap(w.AuxInt, width, off-info.width, info.width) {
+					// Aliases with slot before v's location.
+					lowerOk = false
+				}
+				if overlap(w.AuxInt, width, off+info.width, info.width) {
+					// Aliases with slot after v's location.
+					higherOk = false
+				}
+				if !higherOk && !lowerOk {
+					continue memCheck
+				}
+			}
+		}
+	}
+}
diff --git a/test/codegen/memcombine.go b/test/codegen/memcombine.go
index e1cae0e469..2a9cc68ab0 100644
--- a/test/codegen/memcombine.go
+++ b/test/codegen/memcombine.go
@@ -899,9 +899,11 @@ func store32le(p *struct{ a, b uint32 }, x uint64) {
 	p.b = uint32(x >> 32)
 }
 func store32be(p *struct{ a, b uint32 }, x uint64) {
+	// arm64:"STPW"
 	// ppc64:"MOVD",-"MOVW",-"SRD"
 	// s390x:"MOVD",-"MOVW",-"SRD"
 	p.a = uint32(x >> 32)
+	// arm64:-"STPW"
 	// ppc64:-"MOVW",-"SRD"
 	// s390x:-"MOVW",-"SRD"
 	p.b = uint32(x)
@@ -970,3 +972,95 @@ func issue70300Reverse(v uint64) (b [8]byte) {
 	b[0] = byte(v)
 	return b
 }
+
+// --------------------------------- //
+//    Arm64 double-register loads    //
+// --------------------------------- //
+
+func dwloadI64(p *struct{ a, b int64 }) int64 {
+	// arm64:"LDP\t"
+	return p.a + p.b
+}
+func dwloadI32(p *struct{ a, b int32 }) int32 {
+	// arm64:"LDPW\t"
+	return p.a + p.b
+}
+func dwloadF64(p *struct{ a, b float64 }) float64 {
+	// arm64:"FLDPD\t"
+	return p.a + p.b
+}
+func dwloadF32(p *struct{ a, b float32 }) float32 {
+	// arm64:"FLDPS\t"
+	return p.a + p.b
+}
+
+func dwloadBig(p *struct{ a, b, c, d, e, f int64 }) int64 {
+	// arm64:"LDP\t\\(", "LDP\t16", "LDP\t32"
+	return p.c + p.f + p.a + p.e + p.d + p.b
+}
+
+func dwloadArg(a [2]int64) int64 {
+	// arm64:"LDP\t"
+	return a[0] + a[1]
+}
+
+// ---------------------------------- //
+//    Arm64 double-register stores    //
+// ---------------------------------- //
+
+func dwstoreI64(p *struct{ a, b int64 }, x, y int64) {
+	// arm64:"STP\t"
+	p.a = x
+	p.b = y
+}
+func dwstoreI32(p *struct{ a, b int32 }, x, y int32) {
+	// arm64:"STPW\t"
+	p.a = x
+	p.b = y
+}
+func dwstoreF64(p *struct{ a, b float64 }, x, y float64) {
+	// arm64:"FSTPD\t"
+	p.a = x
+	p.b = y
+}
+func dwstoreF32(p *struct{ a, b float32 }, x, y float32) {
+	// arm64:"FSTPS\t"
+	p.a = x
+	p.b = y
+}
+
+func dwstoreBig(p *struct{ a, b, c, d, e, f int64 }, a, b, c, d, e, f int64) {
+	// This is not perfect. We merge b+a, then d+e, then c and f have no pair.
+	p.c = c
+	p.f = f
+	// arm64:`STP\s\(R[0-9]+, R[0-9]+\), \(R[0-9]+\)`
+	p.a = a
+	// arm64:`STP\s\(R[0-9]+, R[0-9]+\), 24\(R[0-9]+\)`
+	p.e = e
+	p.d = d
+	p.b = b
+}
+
+func dwstoreRet() [2]int {
+	// arm64:"STP\t"
+	return [2]int{5, 6}
+}
+
+func dwstoreLocal(i int) int64 {
+	var a [2]int64
+	a[0] = 5
+	// arm64:"STP\t"
+	a[1] = 6
+	return a[i]
+}
+
+func dwstoreOrder(p *struct {
+	a, b       int64
+	c, d, e, f bool
+}, a, b int64) {
+	// arm64:"STP\t"
+	p.a = a
+	p.c = true
+	p.e = true
+	p.b = b
+}
diff --git a/test/tighten.go b/test/tighten.go
index 92ed2492b2..d85dfecbb0 100644
--- a/test/tighten.go
+++ b/test/tighten.go
@@ -9,14 +9,20 @@
 package main
 
 var (
-	e  any
-	ts uint16
+	ga, gb, gc, gd int
 )
 
 func moveValuesWithMemoryArg(len int) {
 	for n := 0; n < len; n++ {
-		// Load of e.data is lowed as a MOVDload op, which has a memory
-		// argument. It's moved near where it's used.
-		_ = e != ts // ERROR "MOVDload is moved$" "MOVDaddr is moved$"
+		// Loads of b and d can be delayed until inside the outer "if".
+		a := ga
+		b := gb // ERROR "MOVDload is moved$"
+		c := gc
+		d := gd // ERROR "MOVDload is moved$"
+		if a == c {
+			if b == d {
+				return
+			}
+		}
 	}
 }
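
Note on the overlap helper called near the end of pairStores: it is not part of this patch and is presumably already defined elsewhere in package ssa, since pair.go uses it without declaring it. As a reading aid only, here is a minimal sketch of a range-intersection check consistent with those call sites; the parameter names are illustrative and not taken from the Go tree:

func overlap(offset1, size1, offset2, size2 int64) bool {
	// Report whether the byte ranges [offset1, offset1+size1)
	// and [offset2, offset2+size2) intersect.
	if offset1 >= offset2 && offset2+size2 > offset1 {
		return true
	}
	if offset2 >= offset1 && offset1+size1 > offset2 {
		return true
	}
	return false
}

pairStores checks each intervening store w against the slot just below and the slot just above v's offset, clearing lowerOk or higherOk so that v is never merged with a partner store across a write that may alias it.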