cmd/compile: delay expansion of OpArg until expand_calls

As it says, delay expansion of OpArg to the expand_calls phase,
to enable (eventually) interprocedural SSA optimizations and
(sooner) a change to a register ABI.

Includes a round of cleanup to function names and comments,
largely to match the expanded scope of the functions.

This CL removes the per-function dependence on GOSSAHASH,
but the go116lateCallExpansion kill switch remains (and was
tested locally to ensure it worked).

Two functions in expand_calls.go that performed overlapping
work were combined into a single function that is called
twice.

Fixes #42236.
For #40724.

Change-Id: Icbb78947eaa39f17f2c1210d5c2caef20abd6571
Reviewed-on: https://go-review.googlesource.com/c/go/+/262117
Trust: David Chase <drchase@google.com>
Run-TryBot: David Chase <drchase@google.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Author:  David Chase <drchase@google.com>  2020-10-13 19:24:04 -04:00
Commit:  15f01d6ae9 (parent 7fe2a84834)

8 changed files with 330 additions and 125 deletions
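The heart of the change: a selector chain such as StructSelect(ArraySelect(Arg)) can now survive until expand_calls, which collapses it to a single scalar Arg whose offset is the sum of the offsets along the leaf-to-root path. A stand-alone toy sketch of that accumulation (illustrative names only, not compiler code):

package main

import "fmt"

// node models an SSA-ish value: either a base argument at some frame
// offset, or a selector that picks a piece at a relative offset.
type node struct {
	isBase bool
	offset int64 // frame offset for a base, relative offset for a selector
	arg    *node // operand of a selector; nil for a base
}

// resolve walks leaf-to-root the way rewriteSelect does, accumulating
// relative offsets until it reaches the base Arg; the sum is the frame
// offset at which the scalar leaf lives.
func resolve(n *node) int64 {
	off := int64(0)
	for !n.isBase {
		off += n.offset
		n = n.arg
	}
	return off + n.offset
}

func main() {
	arg := &node{isBase: true, offset: 8} // aggregate arg at frame offset 8
	field := &node{offset: 16, arg: arg}  // .f at +16 within the struct
	elem := &node{offset: 4, arg: field}  // [1] of an int32 array at +4
	fmt.Println(resolve(elem))            // prints 28
}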

cmd/compile/fmtmap_test.go

@@ -136,7 +136,6 @@ var knownFormats = map[string]string{
 	"cmd/compile/internal/types.EType %s": "",
 	"cmd/compile/internal/types.EType %v": "",
 	"cmd/internal/obj.ABI %v":             "",
-	"cmd/internal/src.XPos %v":            "",
 	"error %v":                            "",
 	"float64 %.2f":                        "",
 	"float64 %.3f":                        "",

cmd/compile/internal/gc/ssa.go

@@ -409,11 +409,17 @@ func buildssa(fn *Node, worker int) *ssa.Func {
 	// Generate addresses of local declarations
 	s.decladdrs = map[*Node]*ssa.Value{}
+	var args []ssa.Param
+	var results []ssa.Param
 	for _, n := range fn.Func.Dcl {
 		switch n.Class() {
-		case PPARAM, PPARAMOUT:
+		case PPARAM:
 			s.decladdrs[n] = s.entryNewValue2A(ssa.OpLocalAddr, types.NewPtr(n.Type), n, s.sp, s.startmem)
-			if n.Class() == PPARAMOUT && s.canSSA(n) {
+			args = append(args, ssa.Param{Type: n.Type, Offset: int32(n.Xoffset)})
+		case PPARAMOUT:
+			s.decladdrs[n] = s.entryNewValue2A(ssa.OpLocalAddr, types.NewPtr(n.Type), n, s.sp, s.startmem)
+			results = append(results, ssa.Param{Type: n.Type, Offset: int32(n.Xoffset)})
+			if s.canSSA(n) {
 				// Save ssa-able PPARAMOUT variables so we can
 				// store them back to the stack at the end of
 				// the function.

@@ -4909,7 +4915,7 @@ func (s *state) canSSA(n *Node) bool {
 	if n.Class() == PPARAM && n.Sym != nil && n.Sym.Name == ".this" {
 		// wrappers generated by genwrapper need to update
 		// the .this pointer in place.
-		// TODO: treat as a PPARMOUT?
+		// TODO: treat as a PPARAMOUT?
 		return false
 	}
 	return canSSAType(n.Type)

cmd/compile/internal/ssa/compile.go

@@ -429,7 +429,7 @@ var passes = [...]pass{
 	{name: "early copyelim", fn: copyelim},
 	{name: "early deadcode", fn: deadcode}, // remove generated dead code to avoid doing pointless work during opt
 	{name: "short circuit", fn: shortcircuit},
-	{name: "decompose args", fn: decomposeArgs, required: true},
+	{name: "decompose args", fn: decomposeArgs, required: !go116lateCallExpansion, disabled: go116lateCallExpansion}, // handled by late call lowering
 	{name: "decompose user", fn: decomposeUser, required: true},
 	{name: "pre-opt deadcode", fn: deadcode},
 	{name: "opt", fn: opt, required: true}, // NB: some generic rules know the name of the opt pass. TODO: split required rules and optimizing rules

cmd/compile/internal/ssa/config.go

@@ -199,9 +199,9 @@ const (
 const go116lateCallExpansion = true
 
 // LateCallExpansionEnabledWithin returns true if late call expansion should be tested
-// within compilation of a function/method triggered by GOSSAHASH (defaults to "yes").
+// within compilation of a function/method.
 func LateCallExpansionEnabledWithin(f *Func) bool {
-	return go116lateCallExpansion && f.DebugTest // Currently set up for GOSSAHASH bug searches
+	return go116lateCallExpansion
 }
 
 // NewConfig returns a new configuration object for the given architecture.

cmd/compile/internal/ssa/expand_calls.go

@@ -15,7 +15,7 @@ type selKey struct {
 	from   *Value
 	offset int64
 	size   int64
-	typ    types.EType
+	typ    *types.Type
 }
 
 type offsetKey struct {
@@ -27,7 +27,8 @@ type offsetKey struct {
 // expandCalls converts LE (Late Expansion) calls that act like they receive value args into a lower-level form
 // that is more oriented to a platform's ABI.  The SelectN operations that extract results are rewritten into
 // more appropriate forms, and any StructMake or ArrayMake inputs are decomposed until non-struct values are
-// reached.
+// reached.  On the callee side, OpArg nodes are not decomposed until this phase is run.
+// TODO results should not be lowered until this phase.
 func expandCalls(f *Func) {
 	// Calls that need lowering have some number of inputs, including a memory input,
 	// and produce a tuple of (value1, value2, ..., mem) where valueK may or may not be SSA-able.
@@ -42,6 +43,10 @@ func expandCalls(f *Func) {
 	}
 	debug := f.pass.debug > 0
 
+	if debug {
+		fmt.Printf("\nexpandsCalls(%s)\n", f.Name)
+	}
+
 	canSSAType := f.fe.CanSSA
 	regSize := f.Config.RegSize
 	sp, _ := f.spSb()
@@ -58,6 +63,10 @@ func expandCalls(f *Func) {
 	namedSelects := make(map[*Value][]namedVal)
 
+	sdom := f.Sdom()
+
+	common := make(map[selKey]*Value)
+
 	// intPairTypes returns the pair of 32-bit int types needed to encode a 64-bit integer type on a target
 	// that has no 64-bit integer registers.
 	intPairTypes := func(et types.EType) (tHi, tLo *types.Type) {
@@ -107,6 +116,7 @@ func expandCalls(f *Func) {
 		return v
 	}
 
+	// splitSlots splits one "field" (specified by sfx, offset, and ty) out of the LocalSlots in ls and returns the new LocalSlots this generates.
 	splitSlots := func(ls []LocalSlot, sfx string, offset int64, ty *types.Type) []LocalSlot {
 		var locs []LocalSlot
 		for i := range ls {
@@ -147,21 +157,103 @@ func expandCalls(f *Func) {
 	// With the current ABI, the outputs need to be converted to loads, which will all use the call's
 	// memory output as their input.
 
-	// rewriteSelect recursively walks leaf selector to a root (OpSelectN) through
-	// a chain of Struct/Array Select operations.  If the chain of selectors does not
-	// end in OpSelectN, it does nothing (this can happen depending on compiler phase ordering).
-	// It emits the code necessary to implement the leaf select operation that leads to the call.
+	// rewriteSelect recursively walks from leaf selector to a root (OpSelectN, OpLoad, OpArg)
+	// through a chain of Struct/Array/builtin Select operations.  If the chain of selectors does not
+	// end in an expected root, it does nothing (this can happen depending on compiler phase ordering).
+	// The "leaf" provides the type, the root supplies the container, and the leaf-to-root path
+	// accumulates the offset.
+	// It emits the code necessary to implement the leaf select operation that leads to the root.
+	//
 	// TODO when registers really arrive, must also decompose anything split across two registers or registers and memory.
 	var rewriteSelect func(leaf *Value, selector *Value, offset int64) []LocalSlot
 	rewriteSelect = func(leaf *Value, selector *Value, offset int64) []LocalSlot {
+		if debug {
+			fmt.Printf("rewriteSelect(%s, %s, %d)\n", leaf.LongString(), selector.LongString(), offset)
+		}
 		var locs []LocalSlot
 		leafType := leaf.Type
+		if len(selector.Args) > 0 {
+			w := selector.Args[0]
+			if w.Op == OpCopy {
+				for w.Op == OpCopy {
+					w = w.Args[0]
+				}
+				selector.SetArg(0, w)
+			}
+		}
 		switch selector.Op {
-		case OpSelectN:
-			// TODO these may be duplicated. Should memoize. Intermediate selectors will go dead, no worries there.
+		case OpArg:
+			if !isAlreadyExpandedAggregateType(selector.Type) {
+				if leafType == selector.Type { // OpIData leads us here, sometimes.
+					leaf.copyOf(selector)
+				} else {
+					f.Fatalf("Unexpected OpArg type, selector=%s, leaf=%s\n", selector.LongString(), leaf.LongString())
+				}
+				if debug {
+					fmt.Printf("\tOpArg, break\n")
+				}
+				break
+			}
+			if leaf.Op == OpIData {
+				leafType = removeTrivialWrapperTypes(leaf.Type)
+			}
+			aux := selector.Aux
+			auxInt := selector.AuxInt + offset
+			if leaf.Block == selector.Block {
+				leaf.reset(OpArg)
+				leaf.Aux = aux
+				leaf.AuxInt = auxInt
+				leaf.Type = leafType
+			} else {
+				w := selector.Block.NewValue0IA(leaf.Pos, OpArg, leafType, auxInt, aux)
+				leaf.copyOf(w)
+				if debug {
+					fmt.Printf("\tnew %s\n", w.LongString())
+				}
+			}
 			for _, s := range namedSelects[selector] {
 				locs = append(locs, f.Names[s.locIndex])
 			}
+
+		case OpLoad: // We end up here because of IData of immediate structures.
+			// Failure case:
+			// (note the failure case is very rare; w/o this case, make.bash and run.bash both pass, as well as
+			// the hard cases of building {syscall,math,math/cmplx,math/bits,go/constant} on ppc64le and mips-softfloat).
+			//
+			// GOSSAFUNC='(*dumper).dump' go build -gcflags=-l -tags=math_big_pure_go cmd/compile/internal/gc
+			// cmd/compile/internal/gc/dump.go:136:14: internal compiler error: '(*dumper).dump': not lowered: v827, StructSelect PTR PTR
+			//
+			// b2: ← b1
+			// v20 (+142) = StaticLECall <interface {},mem> {AuxCall{reflect.Value.Interface([reflect.Value,0])[interface {},24]}} [40] v8 v1
+			// v21 (142) = SelectN <mem> [1] v20
+			// v22 (142) = SelectN <interface {}> [0] v20
+			// b15: ← b8
+			// v71 (+143) = IData <Nodes> v22 (v[Nodes])
+			// v73 (+146) = StaticLECall <[]*Node,mem> {AuxCall{"".Nodes.Slice([Nodes,0])[[]*Node,8]}} [32] v71 v21
+			//
+			// translates (w/o the "case OpLoad:" above) to:
+			//
+			// b2: ← b1
+			// v20 (+142) = StaticCall <mem> {AuxCall{reflect.Value.Interface([reflect.Value,0])[interface {},24]}} [40] v715
+			// v23 (142) = Load <*uintptr> v19 v20
+			// v823 (142) = IsNonNil <bool> v23
+			// v67 (+143) = Load <*[]*Node> v880 v20
+			// b15: ← b8
+			// v827 (146) = StructSelect <*[]*Node> [0] v67
+			// v846 (146) = Store <mem> {*[]*Node} v769 v827 v20
+			// v73 (+146) = StaticCall <mem> {AuxCall{"".Nodes.Slice([Nodes,0])[[]*Node,8]}} [32] v846
+			//
+			// i.e., the struct select is generated and remains in because it is not applied to an actual structure.
+			// The OpLoad was created to load the single field of the IData
+			// This case removes that StructSelect.
+			if leafType != selector.Type {
+				f.Fatalf("Unexpected Load as selector, leaf=%s, selector=%s\n", leaf.LongString(), selector.LongString())
+			}
+			leaf.copyOf(selector)
+			for _, s := range namedSelects[selector] {
+				locs = append(locs, f.Names[s.locIndex])
+			}
+
+		case OpSelectN:
+			// TODO these may be duplicated. Should memoize. Intermediate selectors will go dead, no worries there.
 			call := selector.Args[0]
 			aux := call.Aux.(*AuxCall)
 			which := selector.AuxInt
@@ -171,10 +263,6 @@ func expandCalls(f *Func) {
 			} else {
 				leafType := removeTrivialWrapperTypes(leaf.Type)
 				if canSSAType(leafType) {
-					for leafType.Etype == types.TSTRUCT && leafType.NumFields() == 1 {
-						// This may not be adequately general -- consider [1]etc but this is caused by immediate IDATA
-						leafType = leafType.Field(0).Type
-					}
 					pt := types.NewPtr(leafType)
 					off := offsetFrom(sp, offset+aux.OffsetOfResult(which), pt)
 					// Any selection right out of the arg area/registers has to be same Block as call, use call as mem input.
@@ -185,24 +273,31 @@ func expandCalls(f *Func) {
 				} else {
 					w := call.Block.NewValue2(leaf.Pos, OpLoad, leafType, off, call)
 					leaf.copyOf(w)
+					if debug {
+						fmt.Printf("\tnew %s\n", w.LongString())
+					}
+				}
+				for _, s := range namedSelects[selector] {
+					locs = append(locs, f.Names[s.locIndex])
 				}
 			} else {
 				f.Fatalf("Should not have non-SSA-able OpSelectN, selector=%s", selector.LongString())
 			}
 		}
 
 	case OpStructSelect:
 		w := selector.Args[0]
 		var ls []LocalSlot
-		if w.Type.Etype != types.TSTRUCT {
-			f.Fatalf("Bad type for w: v=%v; sel=%v; w=%v; ,f=%s\n", leaf.LongString(), selector.LongString(), w.LongString(), f.Name)
-			// Artifact of immediate interface idata
+		if w.Type.Etype != types.TSTRUCT { // IData artifact
 			ls = rewriteSelect(leaf, w, offset)
 		} else {
 			ls = rewriteSelect(leaf, w, offset+w.Type.FieldOff(int(selector.AuxInt)))
-			for _, l := range ls {
-				locs = append(locs, f.fe.SplitStruct(l, int(selector.AuxInt)))
-			}
+			if w.Op != OpIData {
+				for _, l := range ls {
+					locs = append(locs, f.fe.SplitStruct(l, int(selector.AuxInt)))
+				}
+			}
 		}
 
 	case OpArraySelect:
 		w := selector.Args[0]
@@ -221,9 +316,7 @@ func expandCalls(f *Func) {
 	case OpStringPtr:
 		ls := rewriteSelect(leaf, selector.Args[0], offset)
 		locs = splitSlots(ls, ".ptr", 0, typ.BytePtr)
-		//for i := range ls {
-		//	locs = append(locs, f.fe.SplitSlot(&ls[i], ".ptr", 0, typ.BytePtr))
-		//}
 
 	case OpSlicePtr:
 		w := selector.Args[0]
 		ls := rewriteSelect(leaf, w, offset)
@@ -272,32 +365,130 @@ func expandCalls(f *Func) {
 		return locs
 	}
 
-	// storeArg converts stores of SSA-able aggregate arguments (passed to a call) into a series of stores of
-	// smaller types into individual parameter slots.
-	var storeArg func(pos src.XPos, b *Block, a *Value, t *types.Type, offset int64, mem *Value) *Value
-	storeArg = func(pos src.XPos, b *Block, a *Value, t *types.Type, offset int64, mem *Value) *Value {
-		if debug {
-			fmt.Printf("\tstoreArg(%s; %s; %v; %d; %s)\n", b, a.LongString(), t, offset, mem.String())
-		}
+	// storeArgOrLoad converts stores of SSA-able aggregate arguments (passed to a call) into a series of primitive-typed
+	// stores of non-aggregate types.  It recursively walks up a chain of selectors until it reaches a Load or an Arg.
+	// If it does not reach a Load or an Arg, nothing happens; this allows a little freedom in phase ordering.
+	var storeArgOrLoad func(pos src.XPos, b *Block, base, source, mem *Value, t *types.Type, offset int64) *Value
+
+	// decomposeArgOrLoad is a helper for storeArgOrLoad.
+	// It decomposes a Load or an Arg into smaller parts, parameterized by the decomposeOne and decomposeTwo functions
+	// passed to it, and returns the new mem. If the type does not match one of the expected aggregate types, it returns nil instead.
+	decomposeArgOrLoad := func(pos src.XPos, b *Block, base, source, mem *Value, t *types.Type, offset int64,
+		decomposeOne func(pos src.XPos, b *Block, base, source, mem *Value, t1 *types.Type, offArg, offStore int64) *Value,
+		decomposeTwo func(pos src.XPos, b *Block, base, source, mem *Value, t1, t2 *types.Type, offArg, offStore int64) *Value) *Value {
+		u := source.Type
+		switch u.Etype {
+		case types.TARRAY:
+			elem := u.Elem()
+			for i := int64(0); i < u.NumElem(); i++ {
+				elemOff := i * elem.Size()
+				mem = decomposeOne(pos, b, base, source, mem, elem, source.AuxInt+elemOff, offset+elemOff)
+				pos = pos.WithNotStmt()
+			}
+			return mem
+		case types.TSTRUCT:
+			for i := 0; i < u.NumFields(); i++ {
+				fld := u.Field(i)
+				mem = decomposeOne(pos, b, base, source, mem, fld.Type, source.AuxInt+fld.Offset, offset+fld.Offset)
+				pos = pos.WithNotStmt()
+			}
+			return mem
+		case types.TINT64, types.TUINT64:
+			if t.Width == regSize {
+				break
+			}
+			tHi, tLo := intPairTypes(t.Etype)
+			mem = decomposeOne(pos, b, base, source, mem, tHi, source.AuxInt+hiOffset, offset+hiOffset)
+			pos = pos.WithNotStmt()
+			return decomposeOne(pos, b, base, source, mem, tLo, source.AuxInt+lowOffset, offset+lowOffset)
+		case types.TINTER:
+			return decomposeTwo(pos, b, base, source, mem, typ.Uintptr, typ.BytePtr, source.AuxInt, offset)
+		case types.TSTRING:
+			return decomposeTwo(pos, b, base, source, mem, typ.BytePtr, typ.Int, source.AuxInt, offset)
+		case types.TCOMPLEX64:
+			return decomposeTwo(pos, b, base, source, mem, typ.Float32, typ.Float32, source.AuxInt, offset)
+		case types.TCOMPLEX128:
+			return decomposeTwo(pos, b, base, source, mem, typ.Float64, typ.Float64, source.AuxInt, offset)
+		case types.TSLICE:
+			mem = decomposeTwo(pos, b, base, source, mem, typ.BytePtr, typ.Int, source.AuxInt, offset)
+			return decomposeOne(pos, b, base, source, mem, typ.Int, source.AuxInt+2*ptrSize, offset+2*ptrSize)
+		}
+		return nil
+	}
+
+	// storeOneArg creates a decomposed (one step) arg that is then stored.
+	// pos and b locate the store instruction, base is the base of the store target, source is the "base" of the value input,
+	// mem is the input mem, t is the type in question, and offArg and offStore are the offsets from the respective bases.
+	storeOneArg := func(pos src.XPos, b *Block, base, source, mem *Value, t *types.Type, offArg, offStore int64) *Value {
+		w := common[selKey{source, offArg, t.Width, t}]
+		if w == nil {
+			w = source.Block.NewValue0IA(source.Pos, OpArg, t, offArg, source.Aux)
+			common[selKey{source, offArg, t.Width, t}] = w
+		}
+		return storeArgOrLoad(pos, b, base, w, mem, t, offStore)
+	}
+
+	// storeOneLoad creates a decomposed (one step) load that is then stored.
+	storeOneLoad := func(pos src.XPos, b *Block, base, source, mem *Value, t *types.Type, offArg, offStore int64) *Value {
+		from := offsetFrom(source.Args[0], offArg, types.NewPtr(t))
+		w := source.Block.NewValue2(source.Pos, OpLoad, t, from, mem)
+		return storeArgOrLoad(pos, b, base, w, mem, t, offStore)
+	}
+
+	storeTwoArg := func(pos src.XPos, b *Block, base, source, mem *Value, t1, t2 *types.Type, offArg, offStore int64) *Value {
+		mem = storeOneArg(pos, b, base, source, mem, t1, offArg, offStore)
+		pos = pos.WithNotStmt()
+		t1Size := t1.Size()
+		return storeOneArg(pos, b, base, source, mem, t2, offArg+t1Size, offStore+t1Size)
+	}
+
+	storeTwoLoad := func(pos src.XPos, b *Block, base, source, mem *Value, t1, t2 *types.Type, offArg, offStore int64) *Value {
+		mem = storeOneLoad(pos, b, base, source, mem, t1, offArg, offStore)
+		pos = pos.WithNotStmt()
+		t1Size := t1.Size()
+		return storeOneLoad(pos, b, base, source, mem, t2, offArg+t1Size, offStore+t1Size)
+	}
+
+	storeArgOrLoad = func(pos src.XPos, b *Block, base, source, mem *Value, t *types.Type, offset int64) *Value {
+		if debug {
+			fmt.Printf("\tstoreArgOrLoad(%s; %s; %s; %s; %d)\n", base.LongString(), source.LongString(), mem.String(), t.String(), offset)
+		}
+
+		switch source.Op {
+		case OpCopy:
+			return storeArgOrLoad(pos, b, base, source.Args[0], mem, t, offset)
+
+		case OpLoad:
+			ret := decomposeArgOrLoad(pos, b, base, source, mem, t, offset, storeOneLoad, storeTwoLoad)
+			if ret != nil {
+				return ret
+			}
+
+		case OpArg:
+			ret := decomposeArgOrLoad(pos, b, base, source, mem, t, offset, storeOneArg, storeTwoArg)
+			if ret != nil {
+				return ret
+			}
 
-		switch a.Op {
 		case OpArrayMake0, OpStructMake0:
 			return mem
+
 		case OpStructMake1, OpStructMake2, OpStructMake3, OpStructMake4:
 			for i := 0; i < t.NumFields(); i++ {
 				fld := t.Field(i)
-				mem = storeArg(pos, b, a.Args[i], fld.Type, offset+fld.Offset, mem)
+				mem = storeArgOrLoad(pos, b, base, source.Args[i], mem, fld.Type, offset+fld.Offset)
+				pos = pos.WithNotStmt()
 			}
 			return mem
+
 		case OpArrayMake1:
-			return storeArg(pos, b, a.Args[0], t.Elem(), offset, mem)
+			return storeArgOrLoad(pos, b, base, source.Args[0], mem, t.Elem(), offset)
+
 		case OpInt64Make:
 			tHi, tLo := intPairTypes(t.Etype)
-			mem = storeArg(pos, b, a.Args[0], tHi, offset+hiOffset, mem)
-			return storeArg(pos, b, a.Args[1], tLo, offset+lowOffset, mem)
+			mem = storeArgOrLoad(pos, b, base, source.Args[0], mem, tHi, offset+hiOffset)
+			pos = pos.WithNotStmt()
+			return storeArgOrLoad(pos, b, base, source.Args[1], mem, tLo, offset+lowOffset)
+
 		case OpComplexMake:
 			tPart := typ.Float32
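A note on the shape of the new helpers above: decomposeArgOrLoad walks the aggregate's type once and delegates each primitive piece to a callback pair, so one walker serves both flavors (storeOneArg/storeTwoArg for args, storeOneLoad/storeTwoLoad for loads). A minimal stand-alone sketch of that parameterization, with toy types standing in for the compiler's:

package main

import "fmt"

type field struct {
	name string
	off  int64
}

// decompose hands each primitive piece of an aggregate to the supplied
// callback, mirroring how decomposeArgOrLoad is parameterized by the
// storeOne{Arg,Load} helpers rather than hard-coding either behavior.
func decompose(fields []field, one func(f field)) {
	for _, f := range fields {
		one(f)
	}
}

func main() {
	str := []field{{"ptr", 0}, {"len", 8}} // a string: pointer, then length
	// "Arg flavor": each piece becomes a scalar Arg at base+off.
	decompose(str, func(f field) { fmt.Printf("Arg %s at +%d\n", f.name, f.off) })
	// "Load flavor": each piece becomes a Load from addr+off.
	decompose(str, func(f field) { fmt.Printf("Load %s from +%d\n", f.name, f.off) })
}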
@@ -305,59 +496,45 @@ func expandCalls(f *Func) {
 			if wPart == 8 {
 				tPart = typ.Float64
 			}
-			mem = storeArg(pos, b, a.Args[0], tPart, offset, mem)
-			return storeArg(pos, b, a.Args[1], tPart, offset+wPart, mem)
+			mem = storeArgOrLoad(pos, b, base, source.Args[0], mem, tPart, offset)
+			pos = pos.WithNotStmt()
+			return storeArgOrLoad(pos, b, base, source.Args[1], mem, tPart, offset+wPart)
+
 		case OpIMake:
-			mem = storeArg(pos, b, a.Args[0], typ.Uintptr, offset, mem)
-			return storeArg(pos, b, a.Args[1], typ.BytePtr, offset+ptrSize, mem)
+			mem = storeArgOrLoad(pos, b, base, source.Args[0], mem, typ.Uintptr, offset)
+			pos = pos.WithNotStmt()
+			return storeArgOrLoad(pos, b, base, source.Args[1], mem, typ.BytePtr, offset+ptrSize)
+
 		case OpStringMake:
-			mem = storeArg(pos, b, a.Args[0], typ.BytePtr, offset, mem)
-			return storeArg(pos, b, a.Args[1], typ.Int, offset+ptrSize, mem)
+			mem = storeArgOrLoad(pos, b, base, source.Args[0], mem, typ.BytePtr, offset)
+			pos = pos.WithNotStmt()
+			return storeArgOrLoad(pos, b, base, source.Args[1], mem, typ.Int, offset+ptrSize)
+
 		case OpSliceMake:
-			mem = storeArg(pos, b, a.Args[0], typ.BytePtr, offset, mem)
-			mem = storeArg(pos, b, a.Args[1], typ.Int, offset+ptrSize, mem)
-			return storeArg(pos, b, a.Args[2], typ.Int, offset+2*ptrSize, mem)
+			mem = storeArgOrLoad(pos, b, base, source.Args[0], mem, typ.BytePtr, offset)
+			pos = pos.WithNotStmt()
+			mem = storeArgOrLoad(pos, b, base, source.Args[1], mem, typ.Int, offset+ptrSize)
+			return storeArgOrLoad(pos, b, base, source.Args[2], mem, typ.Int, offset+2*ptrSize)
 		}
-		dst := offsetFrom(sp, offset, types.NewPtr(t))
-		x := b.NewValue3A(pos, OpStore, types.TypeMem, t, dst, a, mem)
-		if debug {
-			fmt.Printf("\t\tstoreArg returns %s\n", x.LongString())
-		}
-		return x
-	}
-
-	// splitStore converts a store of an SSA-able aggregate into a series of smaller stores, emitting
-	// appropriate Struct/Array Select operations (which will soon go dead) to obtain the parts.
-	// This has to handle aggregate types that have already been lowered by an earlier phase.
-	var splitStore func(dest, source, mem, v *Value, t *types.Type, offset int64, firstStorePos src.XPos) *Value
-	splitStore = func(dest, source, mem, v *Value, t *types.Type, offset int64, firstStorePos src.XPos) *Value {
-		if debug {
-			fmt.Printf("\tsplitStore(%s; %s; %s; %s; %v; %d; %v)\n", dest.LongString(), source.LongString(), mem.String(), v.LongString(), t, offset, firstStorePos)
-		}
-		pos := v.Pos.WithNotStmt()
+
+		// For nodes that cannot be taken apart -- OpSelectN, other structure selectors.
 		switch t.Etype {
 		case types.TARRAY:
 			elt := t.Elem()
-			if t.NumElem() == 1 && t.Width == regSize && elt.Width == regSize {
+			if source.Type != t && t.NumElem() == 1 && elt.Width == t.Width && t.Width == regSize {
 				t = removeTrivialWrapperTypes(t)
-				if t.Etype == types.TSTRUCT || t.Etype == types.TARRAY {
-					f.Fatalf("Did not expect to find IDATA-immediate with non-trivial struct/array in it")
-				}
-				break // handle the leaf type.
+				// it could be a leaf type, but the "leaf" could be complex64 (for example)
+				return storeArgOrLoad(pos, b, base, source, mem, t, offset)
 			}
 			for i := int64(0); i < t.NumElem(); i++ {
 				sel := source.Block.NewValue1I(pos, OpArraySelect, elt, i, source)
-				mem = splitStore(dest, sel, mem, v, elt, offset+i*elt.Width, firstStorePos)
-				firstStorePos = firstStorePos.WithNotStmt()
+				mem = storeArgOrLoad(pos, b, base, sel, mem, elt, offset+i*elt.Width)
+				pos = pos.WithNotStmt()
 			}
 			return mem
 		case types.TSTRUCT:
-			if t.NumFields() == 1 && t.Field(0).Type.Width == t.Width && t.Width == regSize {
+			if source.Type != t && t.NumFields() == 1 && t.Field(0).Type.Width == t.Width && t.Width == regSize {
 				// This peculiar test deals with accesses to immediate interface data.
 				// It works okay because everything is the same size.
 				// Example code that triggers this can be found in go/constant/value.go, function ToComplex
@@ -377,16 +554,15 @@ func expandCalls(f *Func) {
 				// v139 is later stored as an intVal == struct{val *big.Int} which naively requires the fields of
 				// of a *uint8, which does not succeed.
 				t = removeTrivialWrapperTypes(t)
-
 				// it could be a leaf type, but the "leaf" could be complex64 (for example)
-				return splitStore(dest, source, mem, v, t, offset, firstStorePos)
+				return storeArgOrLoad(pos, b, base, source, mem, t, offset)
 			}
 			for i := 0; i < t.NumFields(); i++ {
 				fld := t.Field(i)
 				sel := source.Block.NewValue1I(pos, OpStructSelect, fld.Type, int64(i), source)
-				mem = splitStore(dest, sel, mem, v, fld.Type, offset+fld.Offset, firstStorePos)
-				firstStorePos = firstStorePos.WithNotStmt()
+				mem = storeArgOrLoad(pos, b, base, sel, mem, fld.Type, offset+fld.Offset)
+				pos = pos.WithNotStmt()
 			}
 			return mem
@@ -396,56 +572,55 @@ func expandCalls(f *Func) {
 			}
 			tHi, tLo := intPairTypes(t.Etype)
 			sel := source.Block.NewValue1(pos, OpInt64Hi, tHi, source)
-			mem = splitStore(dest, sel, mem, v, tHi, offset+hiOffset, firstStorePos)
-			firstStorePos = firstStorePos.WithNotStmt()
+			mem = storeArgOrLoad(pos, b, base, sel, mem, tHi, offset+hiOffset)
+			pos = pos.WithNotStmt()
 			sel = source.Block.NewValue1(pos, OpInt64Lo, tLo, source)
-			return splitStore(dest, sel, mem, v, tLo, offset+lowOffset, firstStorePos)
+			return storeArgOrLoad(pos, b, base, sel, mem, tLo, offset+lowOffset)
 		case types.TINTER:
 			sel := source.Block.NewValue1(pos, OpITab, typ.BytePtr, source)
-			mem = splitStore(dest, sel, mem, v, typ.BytePtr, offset, firstStorePos)
-			firstStorePos = firstStorePos.WithNotStmt()
+			mem = storeArgOrLoad(pos, b, base, sel, mem, typ.BytePtr, offset)
+			pos = pos.WithNotStmt()
 			sel = source.Block.NewValue1(pos, OpIData, typ.BytePtr, source)
-			return splitStore(dest, sel, mem, v, typ.BytePtr, offset+ptrSize, firstStorePos)
+			return storeArgOrLoad(pos, b, base, sel, mem, typ.BytePtr, offset+ptrSize)
 		case types.TSTRING:
 			sel := source.Block.NewValue1(pos, OpStringPtr, typ.BytePtr, source)
-			mem = splitStore(dest, sel, mem, v, typ.BytePtr, offset, firstStorePos)
-			firstStorePos = firstStorePos.WithNotStmt()
+			mem = storeArgOrLoad(pos, b, base, sel, mem, typ.BytePtr, offset)
+			pos = pos.WithNotStmt()
 			sel = source.Block.NewValue1(pos, OpStringLen, typ.Int, source)
-			return splitStore(dest, sel, mem, v, typ.Int, offset+ptrSize, firstStorePos)
+			return storeArgOrLoad(pos, b, base, sel, mem, typ.Int, offset+ptrSize)
 		case types.TSLICE:
 			et := types.NewPtr(t.Elem())
 			sel := source.Block.NewValue1(pos, OpSlicePtr, et, source)
-			mem = splitStore(dest, sel, mem, v, et, offset, firstStorePos)
-			firstStorePos = firstStorePos.WithNotStmt()
+			mem = storeArgOrLoad(pos, b, base, sel, mem, et, offset)
+			pos = pos.WithNotStmt()
 			sel = source.Block.NewValue1(pos, OpSliceLen, typ.Int, source)
-			mem = splitStore(dest, sel, mem, v, typ.Int, offset+ptrSize, firstStorePos)
+			mem = storeArgOrLoad(pos, b, base, sel, mem, typ.Int, offset+ptrSize)
 			sel = source.Block.NewValue1(pos, OpSliceCap, typ.Int, source)
-			return splitStore(dest, sel, mem, v, typ.Int, offset+2*ptrSize, firstStorePos)
+			return storeArgOrLoad(pos, b, base, sel, mem, typ.Int, offset+2*ptrSize)
 		case types.TCOMPLEX64:
 			sel := source.Block.NewValue1(pos, OpComplexReal, typ.Float32, source)
-			mem = splitStore(dest, sel, mem, v, typ.Float32, offset, firstStorePos)
-			firstStorePos = firstStorePos.WithNotStmt()
+			mem = storeArgOrLoad(pos, b, base, sel, mem, typ.Float32, offset)
+			pos = pos.WithNotStmt()
 			sel = source.Block.NewValue1(pos, OpComplexImag, typ.Float32, source)
-			return splitStore(dest, sel, mem, v, typ.Float32, offset+4, firstStorePos)
+			return storeArgOrLoad(pos, b, base, sel, mem, typ.Float32, offset+4)
 		case types.TCOMPLEX128:
 			sel := source.Block.NewValue1(pos, OpComplexReal, typ.Float64, source)
-			mem = splitStore(dest, sel, mem, v, typ.Float64, offset, firstStorePos)
-			firstStorePos = firstStorePos.WithNotStmt()
+			mem = storeArgOrLoad(pos, b, base, sel, mem, typ.Float64, offset)
+			pos = pos.WithNotStmt()
 			sel = source.Block.NewValue1(pos, OpComplexImag, typ.Float64, source)
-			return splitStore(dest, sel, mem, v, typ.Float64, offset+8, firstStorePos)
+			return storeArgOrLoad(pos, b, base, sel, mem, typ.Float64, offset+8)
 		}
-		// Default, including for aggregates whose single element exactly fills their container
-		// TODO this will be a problem for cast interfaces containing floats when we move to registers.
-		x := v.Block.NewValue3A(firstStorePos, OpStore, types.TypeMem, t, offsetFrom(dest, offset, types.NewPtr(t)), source, mem)
+
+		dst := offsetFrom(base, offset, types.NewPtr(t))
+		x := b.NewValue3A(pos, OpStore, types.TypeMem, t, dst, source, mem)
 		if debug {
-			fmt.Printf("\t\tsplitStore returns %s\n", x.LongString())
+			fmt.Printf("\t\tstoreArg returns %s\n", x.LongString())
 		}
 		return x
 	}
@@ -490,7 +665,7 @@ func expandCalls(f *Func) {
 				if debug {
 					fmt.Printf("storeArg %s, %v, %d\n", a.LongString(), aux.TypeOfArg(auxI), aux.OffsetOfArg(auxI))
 				}
-				mem = storeArg(pos, v.Block, a, aux.TypeOfArg(auxI), aux.OffsetOfArg(auxI), mem)
+				mem = storeArgOrLoad(pos, v.Block, sp, a, mem, aux.TypeOfArg(auxI), aux.OffsetOfArg(auxI))
 			}
 		}
 		v.resetArgs()
@@ -523,7 +698,7 @@ func expandCalls(f *Func) {
 		t := name.Type
 		if isAlreadyExpandedAggregateType(t) {
 			for j, v := range f.NamedValues[name] {
-				if v.Op == OpSelectN {
+				if v.Op == OpSelectN || v.Op == OpArg && isAlreadyExpandedAggregateType(v.Type) {
 					ns := namedSelects[v]
 					namedSelects[v] = append(ns, namedVal{locIndex: i, valIndex: j})
 				}
@@ -531,17 +706,19 @@ func expandCalls(f *Func) {
 		}
 	}
 
-	// Step 1: any stores of aggregates remaining are believed to be sourced from call results.
+	// Step 1: any stores of aggregates remaining are believed to be sourced from call results or args.
 	// Decompose those stores into a series of smaller stores, adding selection ops as necessary.
 	for _, b := range f.Blocks {
 		for _, v := range b.Values {
 			if v.Op == OpStore {
 				t := v.Aux.(*types.Type)
+				source := v.Args[1]
+				tSrc := source.Type
 				iAEATt := isAlreadyExpandedAggregateType(t)
 				if !iAEATt {
 					// guarding against store immediate struct into interface data field -- store type is *uint8
 					// TODO can this happen recursively?
-					tSrc := v.Args[1].Type
 					iAEATt = isAlreadyExpandedAggregateType(tSrc)
 					if iAEATt {
 						t = tSrc
@@ -551,8 +728,8 @@ func expandCalls(f *Func) {
 				if debug {
 					fmt.Printf("Splitting store %s\n", v.LongString())
 				}
-				dst, source, mem := v.Args[0], v.Args[1], v.Args[2]
-				mem = splitStore(dst, source, mem, v, t, 0, v.Pos)
+				dst, mem := v.Args[0], v.Args[2]
+				mem = storeArgOrLoad(v.Pos, b, dst, source, mem, t, 0)
 				v.copyOf(mem)
 			}
 		}
@@ -579,7 +756,7 @@ func expandCalls(f *Func) {
 			OpInt64Hi, OpInt64Lo:
 			w := v.Args[0]
 			switch w.Op {
-			case OpStructSelect, OpArraySelect, OpSelectN:
+			case OpStructSelect, OpArraySelect, OpSelectN, OpArg:
 				val2Preds[w] += 1
 				if debug {
 					fmt.Printf("v2p[%s] = %d\n", w.LongString(), val2Preds[w])
@@ -595,6 +772,17 @@ func expandCalls(f *Func) {
 				}
 			}
 
+		case OpArg:
+			if !isAlreadyExpandedAggregateType(v.Type) {
+				continue
+			}
+			if _, ok := val2Preds[v]; !ok {
+				val2Preds[v] = 0
+				if debug {
+					fmt.Printf("v2p[%s] = %d\n", v.LongString(), val2Preds[v])
+				}
+			}
+
 		case OpSelectNAddr:
 			// Do these directly, there are no chains of selectors.
 			call := v.Args[0]
@@ -612,7 +800,6 @@ func expandCalls(f *Func) {
 	// then forwards to rewrite selectors.
 	//
 	// All chains of selectors end up in same block as the call.
-	sdom := f.Sdom()
 
 	// Compilation must be deterministic, so sort after extracting first zeroes from map.
 	// Sorting allows dominators-last order within each batch,
@@ -640,8 +827,11 @@ func expandCalls(f *Func) {
 		last = len(allOrdered)
 		sort.SliceStable(toProcess, less)
 		for _, v := range toProcess {
-			w := v.Args[0]
 			delete(val2Preds, v)
+			if v.Op == OpArg {
+				continue // no Args[0], hence done.
+			}
+			w := v.Args[0]
 			n, ok := val2Preds[w]
 			if !ok {
 				continue
@@ -655,14 +845,20 @@ func expandCalls(f *Func) {
 		}
 	}
 
-	common := make(map[selKey]*Value)
+	common = make(map[selKey]*Value)
 	// Rewrite duplicate selectors as copies where possible.
 	for i := len(allOrdered) - 1; i >= 0; i-- {
 		v := allOrdered[i]
+		if v.Op == OpArg {
+			continue
+		}
 		w := v.Args[0]
-		for w.Op == OpCopy {
-			w = w.Args[0]
-		}
+		if w.Op == OpCopy {
+			for w.Op == OpCopy {
+				w = w.Args[0]
+			}
+			v.SetArg(0, w)
+		}
 		typ := v.Type
 		if typ.IsMemory() {
 			continue // handled elsewhere, not an indexable result
@@ -691,7 +887,7 @@ func expandCalls(f *Func) {
 		case OpComplexImag:
 			offset = size
 		}
-		sk := selKey{from: w, size: size, offset: offset, typ: typ.Etype}
+		sk := selKey{from: w, size: size, offset: offset, typ: typ}
 		dupe := common[sk]
 		if dupe == nil {
 			common[sk] = v
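Why selKey.typ becomes a *types.Type rather than a types.EType (an inference from the diff, not stated in the commit message): with args decomposed in this phase, two selectors can share from, offset, size, and type *kind* while differing in the precise type, and keying on the kind would merge them incorrectly. A toy model of the dedup map, with a string standing in for *types.Type:

package main

import "fmt"

// selKey mirrors the shape of the real key; typ is a string stand-in
// for *types.Type. A coarse kind enum (both "*[]*Node" and "*uintptr"
// are just pointers) would let distinct selectors collide.
type selKey struct {
	from   int // stand-in for the *Value being selected from
	offset int64
	size   int64
	typ    string
}

func main() {
	common := map[selKey]int{}
	memo := func(k selKey, v int) int {
		if dupe, ok := common[k]; ok {
			return dupe // reuse the earlier equivalent selector
		}
		common[k] = v
		return v
	}
	fmt.Println(memo(selKey{1, 0, 8, "*[]*Node"}, 100)) // 100: first sighting
	fmt.Println(memo(selKey{1, 0, 8, "*[]*Node"}, 101)) // 100: deduplicated
	fmt.Println(memo(selKey{1, 0, 8, "*uintptr"}, 102)) // 102: distinct type, kept
}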

cmd/compile/internal/ssa/gen/dec64.rules

@@ -41,20 +41,21 @@
 		lo
 		(Store {hi.Type} dst hi mem))
 
-(Arg {n} [off]) && is64BitInt(v.Type) && !config.BigEndian && v.Type.IsSigned() =>
+// These are not enabled during decomposeBuiltin if late call expansion, but they are always enabled for softFloat
+(Arg {n} [off]) && is64BitInt(v.Type) && !config.BigEndian && v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin") =>
   (Int64Make
     (Arg <typ.Int32> {n} [off+4])
     (Arg <typ.UInt32> {n} [off]))
-(Arg {n} [off]) && is64BitInt(v.Type) && !config.BigEndian && !v.Type.IsSigned() =>
+(Arg {n} [off]) && is64BitInt(v.Type) && !config.BigEndian && !v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin") =>
   (Int64Make
     (Arg <typ.UInt32> {n} [off+4])
     (Arg <typ.UInt32> {n} [off]))
-(Arg {n} [off]) && is64BitInt(v.Type) && config.BigEndian && v.Type.IsSigned() =>
+(Arg {n} [off]) && is64BitInt(v.Type) && config.BigEndian && v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin") =>
   (Int64Make
     (Arg <typ.Int32> {n} [off])
     (Arg <typ.UInt32> {n} [off+4]))
-(Arg {n} [off]) && is64BitInt(v.Type) && config.BigEndian && !v.Type.IsSigned() =>
+(Arg {n} [off]) && is64BitInt(v.Type) && config.BigEndian && !v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin") =>
   (Int64Make
     (Arg <typ.UInt32> {n} [off])
     (Arg <typ.UInt32> {n} [off+4]))
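The new conjunct keys each rule off the running pass's name, so one rules file serves both pipelines: the Arg decomposition is skipped during "decompose builtin" when late call expansion is on, yet stays available for softfloat targets. A toy illustration of gating a rewrite on pass identity (names here are illustrative, not the rule generator's API):

package main

import "fmt"

// rule models a guarded rewrite: it fires only when its condition over
// the current pass name (and a feature flag) holds, mirroring the
// !(go116lateCallExpansion && pass == "decompose builtin") conjunct.
type rule struct {
	name string
	cond func(pass string) bool
}

func main() {
	const lateCallExpansion = true
	arg64 := rule{
		name: "Arg -> Int64Make (Arg, Arg)",
		cond: func(pass string) bool {
			return !(lateCallExpansion && pass == "decompose builtin")
		},
	}
	for _, pass := range []string{"decompose builtin", "softfloat"} {
		fmt.Printf("%s during %q: fires=%v\n", arg64.name, pass, arg64.cond(pass))
	}
}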

cmd/compile/internal/ssa/rewritedec64.go

@@ -184,12 +184,12 @@ func rewriteValuedec64_OpArg(v *Value) bool {
 	config := b.Func.Config
 	typ := &b.Func.Config.Types
 	// match: (Arg {n} [off])
-	// cond: is64BitInt(v.Type) && !config.BigEndian && v.Type.IsSigned()
+	// cond: is64BitInt(v.Type) && !config.BigEndian && v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin")
 	// result: (Int64Make (Arg <typ.Int32> {n} [off+4]) (Arg <typ.UInt32> {n} [off]))
 	for {
 		off := auxIntToInt32(v.AuxInt)
 		n := auxToSym(v.Aux)
-		if !(is64BitInt(v.Type) && !config.BigEndian && v.Type.IsSigned()) {
+		if !(is64BitInt(v.Type) && !config.BigEndian && v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin")) {
 			break
 		}
 		v.reset(OpInt64Make)

@@ -203,12 +203,12 @@ func rewriteValuedec64_OpArg(v *Value) bool {
 		return true
 	}
 	// match: (Arg {n} [off])
-	// cond: is64BitInt(v.Type) && !config.BigEndian && !v.Type.IsSigned()
+	// cond: is64BitInt(v.Type) && !config.BigEndian && !v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin")
 	// result: (Int64Make (Arg <typ.UInt32> {n} [off+4]) (Arg <typ.UInt32> {n} [off]))
 	for {
 		off := auxIntToInt32(v.AuxInt)
 		n := auxToSym(v.Aux)
-		if !(is64BitInt(v.Type) && !config.BigEndian && !v.Type.IsSigned()) {
+		if !(is64BitInt(v.Type) && !config.BigEndian && !v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin")) {
 			break
 		}
 		v.reset(OpInt64Make)

@@ -222,12 +222,12 @@ func rewriteValuedec64_OpArg(v *Value) bool {
 		return true
 	}
 	// match: (Arg {n} [off])
-	// cond: is64BitInt(v.Type) && config.BigEndian && v.Type.IsSigned()
+	// cond: is64BitInt(v.Type) && config.BigEndian && v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin")
 	// result: (Int64Make (Arg <typ.Int32> {n} [off]) (Arg <typ.UInt32> {n} [off+4]))
 	for {
 		off := auxIntToInt32(v.AuxInt)
 		n := auxToSym(v.Aux)
-		if !(is64BitInt(v.Type) && config.BigEndian && v.Type.IsSigned()) {
+		if !(is64BitInt(v.Type) && config.BigEndian && v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin")) {
 			break
 		}
 		v.reset(OpInt64Make)

@@ -241,12 +241,12 @@ func rewriteValuedec64_OpArg(v *Value) bool {
 		return true
 	}
 	// match: (Arg {n} [off])
-	// cond: is64BitInt(v.Type) && config.BigEndian && !v.Type.IsSigned()
+	// cond: is64BitInt(v.Type) && config.BigEndian && !v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin")
 	// result: (Int64Make (Arg <typ.UInt32> {n} [off]) (Arg <typ.UInt32> {n} [off+4]))
 	for {
 		off := auxIntToInt32(v.AuxInt)
 		n := auxToSym(v.Aux)
-		if !(is64BitInt(v.Type) && config.BigEndian && !v.Type.IsSigned()) {
+		if !(is64BitInt(v.Type) && config.BigEndian && !v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin")) {
 			break
 		}
 		v.reset(OpInt64Make)

cmd/compile/internal/ssa/stackalloc.go

@@ -153,6 +153,9 @@ func (s *stackAllocState) stackalloc() {
 		if v.Op != OpArg {
 			continue
 		}
+		if v.Aux == nil {
+			f.Fatalf("%s has nil Aux\n", v.LongString())
+		}
 		loc := LocalSlot{N: v.Aux.(GCNode), Type: v.Type, Off: v.AuxInt}
 		if f.pass.debug > stackDebug {
 			fmt.Printf("stackalloc %s to %s\n", v, loc)