[dev.link] all: merge branch 'master' into dev.link

Clean merge. Change-Id: I94ac733fd3147abf42d89ccbfcc68f54ed5f4d13
2025-05-31 23:25:39 +00:00 · 2020-04-06 10:59:39 -04:00 · 2020-04-06 10:59:39 -04:00 · c8d89ddb18
commit c8d89ddb18
parent 6636b3f2fc a4451e1143
72 changed files with 1579 additions and 356 deletions
--- a/1
+++ b/1
@ -144,6 +144,7 @@ Andy Davis <andy@bigandian.com>
 Andy Finkenstadt <afinkenstadt@zynga.com>
 Andy Lindeman <andy@lindeman.io>
 Andy Maloney <asmaloney@gmail.com>
+Andy Pan <panjf2000@gmail.com>
 Andy Walker <walkeraj@gmail.com>
 Anfernee Yongkun Gui <anfernee.gui@gmail.com>
 Angelo Bulfone <mbulfone@gmail.com>
--- a/1
+++ b/1
@ -216,6 +216,7 @@ Andy Davis <andy@bigandian.com>
 Andy Finkenstadt <afinkenstadt@zynga.com>
 Andy Lindeman <andy@lindeman.io>
 Andy Maloney <asmaloney@gmail.com>
+Andy Pan <panjf2000@gmail.com>
 Andy Walker <walkeraj@gmail.com>
 Andzej Maciusovic <andzej.maciusovic@gmail.com>
 Anfernee Yongkun Gui <anfernee.gui@gmail.com>
--- a/doc/go1.15.html
+++ b/doc/go1.15.html
@ -119,6 +119,21 @@ TODO
 TODO
 </p>

+<dl id="flag"><dt><a href="/pkg/flag/">flag</a></dt>
+  <dd>
+    <p><!-- CL 221427 -->
+      When the flag package sees <code>-h</code> or <code>-help</code>, and
+      those flags are not defined, the flag package prints a usage message.
+      If the <a href="/pkg/flag/#FlagSet"><code>FlagSet</code></a> was created with
+      <a href="/pkg/flag/#ExitOnError"><code>ExitOnError</code></a>,
+      <a href="/pkg/flag/#FlagSet.Parse"><code>FlagSet.Parse</code></a> would then
+      exit with a status of 2. In this release, the exit status for <code>-h</code>
+      or <code>-help</code> has been changed to 0. In particular, this applies to
+      the default handling of command line flags.
+    </p>
+  </dd>
+</dl>
+
 <dl id="pkg-runtime"><dt><a href="/pkg/runtime/">runtime</a></dt>
  <dd>
    <p><!-- CL 221779 -->
@ -133,23 +148,35 @@ TODO

 <dl id="sync"><dt><a href="/pkg/sync/">sync</a></dt>
  <dd>
-    <p><!-- golang.org/issue/33762 -->
+    <p><!-- CL 205899, golang.org/issue/33762 -->
      The new method
-      <a href="/pkg/sync#Map.LoadAndDelete"><code>Map.LoadAndDelete</code></a>
+      <a href="/pkg/sync/#Map.LoadAndDelete"><code>Map.LoadAndDelete</code></a>
      atomically deletes a key and returns the previous value if present.
    </p>
    <p><!-- CL 205899 -->
      The method
-      <a href="/pkg/sync#Map.Delete"><code>Map.Delete</code></a>
+      <a href="/pkg/sync/#Map.Delete"><code>Map.Delete</code></a>
      is more efficient.
    </p>
 </dl><!-- sync -->

+<dl id="testing"><dt><a href="/pkg/testing/">testing</a></dt>
+  <dd>
+    <p><!-- CL 226877, golang.org/issue/35998 -->
+       The new methods
+       <a href="/pkg/testing/#T.TempDir"><code>T.TempDir</code></a> and
+       <a href="/pkg/testing/#B.TempDir"><code>B.TempDir</code></a>
+       return temporary directories that are automatically cleaned up
+       at the end of the test.
+    </p>
+  </dd>
+</dl><!-- testing -->
+
 <dl id="time"><dt><a href="/pkg/time/">time</a></dt>
  <dd>
-    <p><!-- golang.org/issue/33184 -->
+    <p><!-- CL 220424, CL 217362, golang.org/issue/33184 -->
       The new method
-       <a href="/pkg/time#Ticker.Reset"><code>Ticker.Reset</code></a>
+       <a href="/pkg/time/#Ticker.Reset"><code>Ticker.Reset</code></a>
       supports changing the duration of a ticker.
    </p>
  </dd>
--- a/src/cmd/api/goapi.go
+++ b/src/cmd/api/goapi.go
@ -60,8 +60,6 @@ var contexts = []*build.Context{
 	{GOOS: "linux", GOARCH: "amd64"},
 	{GOOS: "linux", GOARCH: "arm", CgoEnabled: true},
 	{GOOS: "linux", GOARCH: "arm"},
-	{GOOS: "darwin", GOARCH: "386", CgoEnabled: true},
-	{GOOS: "darwin", GOARCH: "386"},
 	{GOOS: "darwin", GOARCH: "amd64", CgoEnabled: true},
 	{GOOS: "darwin", GOARCH: "amd64"},
 	{GOOS: "windows", GOARCH: "amd64"},
@ -252,6 +250,13 @@ func featureWithoutContext(f string) string {
 	return spaceParensRx.ReplaceAllString(f, "")
 }

+// portRemoved reports whether the given port-specific API feature is
+// okay to no longer exist because its port was removed.
+func portRemoved(feature string) bool {
+	return strings.Contains(feature, "(darwin-386)") ||
+		strings.Contains(feature, "(darwin-386-cgo)")
+}
+
 func compareAPI(w io.Writer, features, required, optional, exception []string, allowAdd bool) (ok bool) {
 	ok = true

@ -279,6 +284,8 @@ func compareAPI(w io.Writer, features, required, optional, exception []string, a
 				// acknowledged by being in the file
 				// "api/except.txt". No need to print them out
 				// here.
+			} else if portRemoved(feature) {
+				// okay.
 			} else if featureSet[featureWithoutContext(feature)] {
 				// okay.
 			} else {
--- a/src/cmd/api/goapi_test.go
+++ b/src/cmd/api/goapi_test.go
@ -140,7 +140,6 @@ func TestCompareAPI(t *testing.T) {
 			name: "contexts reconverging",
 			required: []string{
 				"A",
-				"pkg syscall (darwin-386), type RawSockaddrInet6 struct",
 				"pkg syscall (darwin-amd64), type RawSockaddrInet6 struct",
 			},
 			features: []string{
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@ -902,6 +902,12 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.From.Type = obj.TYPE_REG
 		p.From.Reg = v.Args[0].Reg()
 		gc.AddrAuto(&p.To, v)
+	case ssa.OpAMD64LoweredHasCPUFeature:
+		p := s.Prog(x86.AMOVB)
+		p.From.Type = obj.TYPE_MEM
+		gc.AddAux(&p.From, v)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = v.Reg()
 	case ssa.OpAMD64LoweredGetClosurePtr:
 		// Closure pointer is DX.
 		gc.CheckLoweredGetClosurePtr(v)
--- a/src/cmd/compile/internal/gc/inl.go
+++ b/src/cmd/compile/internal/gc/inl.go
@ -496,7 +496,14 @@ func inlcalls(fn *Node) {
 	if countNodes(fn) >= inlineBigFunctionNodes {
 		maxCost = inlineBigFunctionMaxCost
 	}
-	fn = inlnode(fn, maxCost)
+	// Map to keep track of functions that have been inlined at a particular
+	// call site, in order to stop inlining when we reach the beginning of a
+	// recursion cycle again. We don't inline immediately recursive functions,
+	// but allow inlining if there is a recursion cycle of many functions.
+	// Most likely, the inlining will stop before we even hit the beginning of
+	// the cycle again, but the map catches the unusual case.
+	inlMap := make(map[*Node]bool)
+	fn = inlnode(fn, maxCost, inlMap)
 	if fn != Curfn {
 		Fatalf("inlnode replaced curfn")
 	}
@ -537,10 +544,10 @@ func inlconv2list(n *Node) []*Node {
 	return s
 }

-func inlnodelist(l Nodes, maxCost int32) {
+func inlnodelist(l Nodes, maxCost int32, inlMap map[*Node]bool) {
 	s := l.Slice()
 	for i := range s {
-		s[i] = inlnode(s[i], maxCost)
+		s[i] = inlnode(s[i], maxCost, inlMap)
 	}
 }

@ -557,7 +564,7 @@ func inlnodelist(l Nodes, maxCost int32) {
 // shorter and less complicated.
 // The result of inlnode MUST be assigned back to n, e.g.
 // 	n.Left = inlnode(n.Left)
-func inlnode(n *Node, maxCost int32) *Node {
+func inlnode(n *Node, maxCost int32, inlMap map[*Node]bool) *Node {
 	if n == nil {
 		return n
 	}
@ -585,19 +592,19 @@ func inlnode(n *Node, maxCost int32) *Node {

 	lno := setlineno(n)

-	inlnodelist(n.Ninit, maxCost)
+	inlnodelist(n.Ninit, maxCost, inlMap)
 	for _, n1 := range n.Ninit.Slice() {
 		if n1.Op == OINLCALL {
 			inlconv2stmt(n1)
 		}
 	}

-	n.Left = inlnode(n.Left, maxCost)
+	n.Left = inlnode(n.Left, maxCost, inlMap)
 	if n.Left != nil && n.Left.Op == OINLCALL {
 		n.Left = inlconv2expr(n.Left)
 	}

-	n.Right = inlnode(n.Right, maxCost)
+	n.Right = inlnode(n.Right, maxCost, inlMap)
 	if n.Right != nil && n.Right.Op == OINLCALL {
 		if n.Op == OFOR || n.Op == OFORUNTIL {
 			inlconv2stmt(n.Right)
@ -612,7 +619,7 @@ func inlnode(n *Node, maxCost int32) *Node {
 		}
 	}

-	inlnodelist(n.List, maxCost)
+	inlnodelist(n.List, maxCost, inlMap)
 	if n.Op == OBLOCK {
 		for _, n2 := range n.List.Slice() {
 			if n2.Op == OINLCALL {
@ -628,7 +635,7 @@ func inlnode(n *Node, maxCost int32) *Node {
 		}
 	}

-	inlnodelist(n.Rlist, maxCost)
+	inlnodelist(n.Rlist, maxCost, inlMap)
 	s := n.Rlist.Slice()
 	for i1, n1 := range s {
 		if n1.Op == OINLCALL {
@ -640,7 +647,7 @@ func inlnode(n *Node, maxCost int32) *Node {
 		}
 	}

-	inlnodelist(n.Nbody, maxCost)
+	inlnodelist(n.Nbody, maxCost, inlMap)
 	for _, n := range n.Nbody.Slice() {
 		if n.Op == OINLCALL {
 			inlconv2stmt(n)
@ -663,12 +670,12 @@ func inlnode(n *Node, maxCost int32) *Node {
 			fmt.Printf("%v:call to func %+v\n", n.Line(), n.Left)
 		}
 		if n.Left.Func != nil && n.Left.Func.Inl != nil && !isIntrinsicCall(n) { // normal case
-			n = mkinlcall(n, n.Left, maxCost)
+			n = mkinlcall(n, n.Left, maxCost, inlMap)
 		} else if n.Left.isMethodExpression() && asNode(n.Left.Sym.Def) != nil {
-			n = mkinlcall(n, asNode(n.Left.Sym.Def), maxCost)
+			n = mkinlcall(n, asNode(n.Left.Sym.Def), maxCost, inlMap)
 		} else if n.Left.Op == OCLOSURE {
 			if f := inlinableClosure(n.Left); f != nil {
-				n = mkinlcall(n, f, maxCost)
+				n = mkinlcall(n, f, maxCost, inlMap)
 			}
 		} else if n.Left.Op == ONAME && n.Left.Name != nil && n.Left.Name.Defn != nil {
 			if d := n.Left.Name.Defn; d.Op == OAS && d.Right.Op == OCLOSURE {
@ -694,7 +701,7 @@ func inlnode(n *Node, maxCost int32) *Node {
 						}
 						break
 					}
-					n = mkinlcall(n, f, maxCost)
+					n = mkinlcall(n, f, maxCost, inlMap)
 				}
 			}
 		}
@ -713,7 +720,7 @@ func inlnode(n *Node, maxCost int32) *Node {
 			Fatalf("no function definition for [%p] %+v\n", n.Left.Type, n.Left.Type)
 		}

-		n = mkinlcall(n, asNode(n.Left.Type.FuncType().Nname), maxCost)
+		n = mkinlcall(n, asNode(n.Left.Type.FuncType().Nname), maxCost, inlMap)
 	}

 	lineno = lno
@ -833,7 +840,7 @@ var inlgen int
 // parameters.
 // The result of mkinlcall MUST be assigned back to n, e.g.
 // 	n.Left = mkinlcall(n.Left, fn, isddd)
-func mkinlcall(n, fn *Node, maxCost int32) *Node {
+func mkinlcall(n, fn *Node, maxCost int32, inlMap map[*Node]bool) *Node {
 	if fn.Func.Inl == nil {
 		// No inlinable body.
 		return n
@ -866,6 +873,16 @@ func mkinlcall(n, fn *Node, maxCost int32) *Node {
 		return n
 	}

+	if inlMap[fn] {
+		if Debug['m'] > 1 {
+			fmt.Printf("%v: cannot inline %v into %v: repeated recursive cycle\n", n.Line(), fn, Curfn.funcname())
+		}
+		return n
+	}
+	inlMap[fn] = true
+	defer func() {
+		inlMap[fn] = false
+	}()
 	if Debug_typecheckinl == 0 {
 		typecheckinl(fn)
 	}
@ -1129,7 +1146,7 @@ func mkinlcall(n, fn *Node, maxCost int32) *Node {
 	// instead we emit the things that the body needs
 	// and each use must redo the inlining.
 	// luckily these are small.
-	inlnodelist(call.Nbody, maxCost)
+	inlnodelist(call.Nbody, maxCost, inlMap)
 	for _, n := range call.Nbody.Slice() {
 		if n.Op == OINLCALL {
 			inlconv2stmt(n)
--- a/src/cmd/compile/internal/gc/main.go
+++ b/src/cmd/compile/internal/gc/main.go
@ -379,9 +379,8 @@ func Main(archInit func(*Arch)) {
 	if flag_race && flag_msan {
 		log.Fatal("cannot use both -race and -msan")
 	}
-	if (flag_race || flag_msan) && objabi.GOOS != "windows" {
-		// -race and -msan imply -d=checkptr for now (except on windows).
-		// TODO(mdempsky): Re-evaluate before Go 1.14. See #34964.
+	if flag_race || flag_msan {
+		// -race and -msan imply -d=checkptr for now.
 		Debug_checkptr = 1
 	}
 	if ispkgin(omit_pkgs) {
@ -679,8 +678,12 @@ func Main(archInit func(*Arch)) {
 	if Debug['l'] != 0 {
 		// Find functions that can be inlined and clone them before walk expands them.
 		visitBottomUp(xtop, func(list []*Node, recursive bool) {
+			numfns := numNonClosures(list)
 			for _, n := range list {
-				if !recursive {
+				if !recursive || numfns > 1 {
+					// We allow inlining if there is no
+					// recursion, or the recursion cycle is
+					// across more than one function.
 					caninl(n)
 				} else {
 					if Debug['m'] > 1 {
@ -824,6 +827,17 @@ func Main(archInit func(*Arch)) {
 	}
 }

+// numNonClosures returns the number of functions in list which are not closures.
+func numNonClosures(list []*Node) int {
+	count := 0
+	for _, n := range list {
+		if n.Func.Closure == nil {
+			count++
+		}
+	}
+	return count
+}
+
 func writebench(filename string) error {
 	f, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
 	if err != nil {
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@ -339,7 +339,7 @@ func buildssa(fn *Node, worker int) *ssa.Func {
 	s.softFloat = s.config.SoftFloat

 	if printssa {
-		s.f.HTMLWriter = ssa.NewHTMLWriter(ssaDumpFile, s.f.Frontend(), name, ssaDumpCFG)
+		s.f.HTMLWriter = ssa.NewHTMLWriter(ssaDumpFile, s.f, ssaDumpCFG)
 		// TODO: generate and print a mapping from nodes to values and blocks
 		dumpSourcesColumn(s.f.HTMLWriter, fn)
 		s.f.HTMLWriter.WriteAST("AST", astBuf)
@ -471,7 +471,7 @@ func dumpSourcesColumn(writer *ssa.HTMLWriter, fn *Node) {
 	fname := Ctxt.PosTable.Pos(fn.Pos).Filename()
 	targetFn, err := readFuncLines(fname, fn.Pos.Line(), fn.Func.Endlineno.Line())
 	if err != nil {
-		writer.Logger.Logf("cannot read sources for function %v: %v", fn, err)
+		writer.Logf("cannot read sources for function %v: %v", fn, err)
 	}

 	// Read sources of inlined functions.
@ -487,7 +487,7 @@ func dumpSourcesColumn(writer *ssa.HTMLWriter, fn *Node) {
 		fname := Ctxt.PosTable.Pos(fi.Pos).Filename()
 		fnLines, err := readFuncLines(fname, fi.Pos.Line(), elno.Line())
 		if err != nil {
-			writer.Logger.Logf("cannot read sources for function %v: %v", fi, err)
+			writer.Logf("cannot read sources for inlined function %v: %v", fi, err)
 			continue
 		}
 		inlFns = append(inlFns, fnLines)
@ -3595,8 +3595,7 @@ func init() {
 				s.vars[n] = s.load(types.Types[TFLOAT64], a)
 				return s.variable(n, types.Types[TFLOAT64])
 			}
-			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), x86HasFMA, s.sb)
-			v := s.load(types.Types[TBOOL], addr)
+			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[TBOOL], x86HasFMA)
 			b := s.endBlock()
 			b.Kind = ssa.BlockIf
 			b.SetControl(v)
@ -3661,8 +3660,7 @@ func init() {

 	makeRoundAMD64 := func(op ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 		return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
-			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), x86HasSSE41, s.sb)
-			v := s.load(types.Types[TBOOL], addr)
+			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[TBOOL], x86HasSSE41)
 			b := s.endBlock()
 			b.Kind = ssa.BlockIf
 			b.SetControl(v)
@ -3869,8 +3867,7 @@ func init() {

 	makeOnesCountAMD64 := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 		return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
-			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), x86HasPOPCNT, s.sb)
-			v := s.load(types.Types[TBOOL], addr)
+			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[TBOOL], x86HasPOPCNT)
 			b := s.endBlock()
 			b.Kind = ssa.BlockIf
 			b.SetControl(v)
--- a/src/cmd/compile/internal/logopt/logopt_test.go
+++ b/src/cmd/compile/internal/logopt/logopt_test.go
@ -38,6 +38,12 @@ func want(t *testing.T, out string, desired string) {
 	}
 }

+func wantN(t *testing.T, out string, desired string, n int) {
+	if strings.Count(out, desired) != n {
+		t.Errorf("expected exactly %d occurences of %s in \n%s", n, desired, out)
+	}
+}
+
 func TestLogOpt(t *testing.T) {
 	t.Parallel()

@ -75,7 +81,70 @@ func TestLogOpt(t *testing.T) {

 	})

+	// replace d (dir)  with t ("tmpdir") and convert path separators to '/'
+	normalize := func(out []byte, d, t string) string {
+		s := string(out)
+		s = strings.ReplaceAll(s, d, t)
+		s = strings.ReplaceAll(s, string(os.PathSeparator), "/")
+		return s
+	}
+
+	// Ensure that <128 byte copies are not reported and that 128-byte copies are.
+	// Check at both 1 and 8-byte alignments.
+	t.Run("Copy", func(t *testing.T) {
+		const copyCode = `package x
+func s128a1(x *[128]int8) [128]int8 { 
+	return *x
+}
+func s127a1(x *[127]int8) [127]int8 {
+	return *x
+}
+func s16a8(x *[16]int64) [16]int64 {
+	return *x
+}
+func s15a8(x *[15]int64) [15]int64 {
+	return *x
+}
+`
+		copy := filepath.Join(dir, "copy.go")
+		if err := ioutil.WriteFile(copy, []byte(copyCode), 0644); err != nil {
+			t.Fatal(err)
+		}
+		outcopy := filepath.Join(dir, "copy.o")
+
+		// On not-amd64, test the host architecture and os
+		arches := []string{runtime.GOARCH}
+		goos0 := runtime.GOOS
+		if runtime.GOARCH == "amd64" { // Test many things with "linux" (wasm will get "js")
+			arches = []string{"arm", "arm64", "386", "amd64", "mips", "mips64", "ppc64le", "s390x", "wasm"}
+			goos0 = "linux"
+		}
+
+		for _, arch := range arches {
+			t.Run(arch, func(t *testing.T) {
+				goos := goos0
+				if arch == "wasm" {
+					goos = "js"
+				}
+				_, err := testCopy(t, dir, arch, goos, copy, outcopy)
+				if err != nil {
+					t.Error("-json=0,file://log/opt should have succeeded")
+				}
+				logged, err := ioutil.ReadFile(filepath.Join(dir, "log", "opt", "x", "copy.json"))
+				if err != nil {
+					t.Error("-json=0,file://log/opt missing expected log file")
+				}
+				slogged := normalize(logged, string(uriIfy(dir)), string(uriIfy("tmpdir")))
+				t.Logf("%s", slogged)
+				want(t, slogged, `{"range":{"start":{"line":3,"character":2},"end":{"line":3,"character":2}},"severity":3,"code":"copy","source":"go compiler","message":"128 bytes"}`)
+				want(t, slogged, `{"range":{"start":{"line":9,"character":2},"end":{"line":9,"character":2}},"severity":3,"code":"copy","source":"go compiler","message":"128 bytes"}`)
+				wantN(t, slogged, `"code":"copy"`, 2)
+			})
+		}
+	})
+
 	// Some architectures don't fault on nil dereference, so nilchecks are eliminated differently.
+	// The N-way copy test also doesn't need to run N-ways N times.
 	if runtime.GOARCH != "amd64" {
 		return
 	}
@ -83,14 +152,6 @@ func TestLogOpt(t *testing.T) {
 	t.Run("Success", func(t *testing.T) {
 		// This test is supposed to succeed

-		// replace d (dir)  with t ("tmpdir") and convert path separators to '/'
-		normalize := func(out []byte, d, t string) string {
-			s := string(out)
-			s = strings.ReplaceAll(s, d, t)
-			s = strings.ReplaceAll(s, string(os.PathSeparator), "/")
-			return s
-		}
-
 		// Note 'file://' is the I-Know-What-I-Am-Doing way of specifying a file, also to deal with corner cases for Windows.
 		_, err := testLogOptDir(t, dir, "-json=0,file://log/opt", src, outfile)
 		if err != nil {
@ -131,3 +192,15 @@ func testLogOptDir(t *testing.T, dir, flag, src, outfile string) (string, error)
 	t.Logf("%s", out)
 	return string(out), err
 }
+
+func testCopy(t *testing.T, dir, goarch, goos, src, outfile string) (string, error) {
+	// Notice the specified import path "x"
+	run := []string{testenv.GoToolPath(t), "tool", "compile", "-p", "x", "-json=0,file://log/opt", "-o", outfile, src}
+	t.Log(run)
+	cmd := exec.Command(run[0], run[1:]...)
+	cmd.Dir = dir
+	cmd.Env = []string{"GOARCH=" + goarch, "GOOS=" + goos}
+	out, err := cmd.CombinedOutput()
+	t.Logf("%s", out)
+	return string(out), err
+}
--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go
@ -850,39 +850,226 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.From.Type = obj.TYPE_CONST
 		p.From.Offset = v.AuxInt & 3

-	case ssa.OpPPC64LoweredZero:
+	case ssa.OpPPC64LoweredQuadZero, ssa.OpPPC64LoweredQuadZeroShort:
+		// The LoweredQuad code generation
+		// generates STXV instructions on
+		// power9. The Short variation is used
+		// if no loop is generated.

-		// unaligned data doesn't hurt performance
-		// for these instructions on power8 or later
+		// sizes >= 64 generate a loop as follows:

-		// for sizes >= 64 generate a loop as follows:
+		// Set up loop counter in CTR, used by BC
+		// XXLXOR clears VS32
+		//       XXLXOR VS32,VS32,VS32
+		//       MOVD len/64,REG_TMP
+		//       MOVD REG_TMP,CTR
+		//       loop:
+		//       STXV VS32,0(R20)
+		//       STXV VS32,16(R20)
+		//       STXV VS32,32(R20)
+		//       STXV VS32,48(R20)
+		//       ADD  $64,R20
+		//       BC   16, 0, loop

-		// set up loop counter in CTR, used by BC
+		// Number of 64-byte loop iterations
+		ctr := v.AuxInt / 64
+
+		// Remainder bytes
+		rem := v.AuxInt % 64
+
+		// Only generate a loop if there is more
+		// than 1 iteration.
+		if ctr > 1 {
+			// Set up VS32 (V0) to hold 0s
+			p := s.Prog(ppc64.AXXLXOR)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+			p.Reg = ppc64.REG_VS32
+
+			// Set up CTR loop counter
+			p = s.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = ctr
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REGTMP
+
+			p = s.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REGTMP
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_CTR
+
+			// Don't generate padding for
+			// loops with few iterations.
+			if ctr > 3 {
+				p = s.Prog(obj.APCALIGN)
+				p.From.Type = obj.TYPE_CONST
+				p.From.Offset = 16
+			}
+
+			// generate 4 STXVs to zero 64 bytes
+			var top *obj.Prog
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+
+			//  Save the top of loop
+			if top == nil {
+				top = p
+			}
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = 16
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = 32
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = 48
+
+			// Increment address for the
+			// 64 bytes just zeroed.
+			p = s.Prog(ppc64.AADD)
+			p.Reg = v.Args[0].Reg()
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = 64
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = v.Args[0].Reg()
+
+			// Branch back to top of loop
+			// based on CTR
+			// BC with BO_BCTR generates bdnz
+			p = s.Prog(ppc64.ABC)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = ppc64.BO_BCTR
+			p.Reg = ppc64.REG_R0
+			p.To.Type = obj.TYPE_BRANCH
+			gc.Patch(p, top)
+		}
+		// When ctr == 1 the loop was not generated but
+		// there are at least 64 bytes to clear, so add
+		// that to the remainder to generate the code
+		// to clear those doublewords
+		if ctr == 1 {
+			rem += 64
+		}
+
+		// Clear the remainder starting at offset zero
+		offset := int64(0)
+
+		if rem >= 16 && ctr <= 1 {
+			// If the XXLXOR hasn't already been
+			// generated, do it here to initialize
+			// VS32 (V0) to 0.
+			p := s.Prog(ppc64.AXXLXOR)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+			p.Reg = ppc64.REG_VS32
+		}
+		// Generate STXV for 32 or 64
+		// bytes.
+		for rem >= 32 {
+			p := s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = offset
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = offset + 16
+			offset += 32
+			rem -= 32
+		}
+		// Generate 16 bytes
+		if rem >= 16 {
+			p := s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = offset
+			offset += 16
+			rem -= 16
+		}
+
+		// first clear as many doublewords as possible
+		// then clear remaining sizes as available
+		for rem > 0 {
+			op, size := ppc64.AMOVB, int64(1)
+			switch {
+			case rem >= 8:
+				op, size = ppc64.AMOVD, 8
+			case rem >= 4:
+				op, size = ppc64.AMOVW, 4
+			case rem >= 2:
+				op, size = ppc64.AMOVH, 2
+			}
+			p := s.Prog(op)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_R0
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = offset
+			rem -= size
+			offset += size
+		}
+
+	case ssa.OpPPC64LoweredZero, ssa.OpPPC64LoweredZeroShort:
+
+		// Unaligned data doesn't hurt performance
+		// for these instructions on power8.
+
+		// For sizes >= 64 generate a loop as follows:
+
+		// Set up loop counter in CTR, used by BC
 		//       XXLXOR VS32,VS32,VS32
 		//	 MOVD len/32,REG_TMP
 		//	 MOVD REG_TMP,CTR
 		//       MOVD $16,REG_TMP
 		//	 loop:
-		//	 STXVD2X VS32,(R0)(R3)
-		//	 STXVD2X VS32,(R31)(R3)
-		//	 ADD  $32,R3
+		//	 STXVD2X VS32,(R0)(R20)
+		//	 STXVD2X VS32,(R31)(R20)
+		//	 ADD  $32,R20
 		//	 BC   16, 0, loop
 		//
 		// any remainder is done as described below

 		// for sizes < 64 bytes, first clear as many doublewords as possible,
 		// then handle the remainder
-		//	MOVD R0,(R3)
-		//	MOVD R0,8(R3)
+		//	MOVD R0,(R20)
+		//	MOVD R0,8(R20)
 		// .... etc.
 		//
 		// the remainder bytes are cleared using one or more
 		// of the following instructions with the appropriate
 		// offsets depending which instructions are needed
 		//
-		//	MOVW R0,n1(R3)	4 bytes
-		//	MOVH R0,n2(R3)	2 bytes
-		//	MOVB R0,n3(R3)	1 byte
+		//	MOVW R0,n1(R20)	4 bytes
+		//	MOVH R0,n2(R20)	2 bytes
+		//	MOVB R0,n3(R20)	1 byte
 		//
 		// 7 bytes: MOVW, MOVH, MOVB
 		// 6 bytes: MOVW, MOVH
@ -926,10 +1113,19 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REGTMP

+			// Don't add padding for alignment
+			// with few loop iterations.
+			if ctr > 3 {
+				p = s.Prog(obj.APCALIGN)
+				p.From.Type = obj.TYPE_CONST
+				p.From.Offset = 16
+			}
+
 			// generate 2 STXVD2Xs to store 16 bytes
 			// when this is a loop then the top must be saved
 			var top *obj.Prog
 			// This is the top of loop
+
 			p = s.Prog(ppc64.ASTXVD2X)
 			p.From.Type = obj.TYPE_REG
 			p.From.Reg = ppc64.REG_VS32
@ -940,7 +1136,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			if top == nil {
 				top = p
 			}
-
 			p = s.Prog(ppc64.ASTXVD2X)
 			p.From.Type = obj.TYPE_REG
 			p.From.Reg = ppc64.REG_VS32
@ -1001,8 +1196,9 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			offset += size
 		}

-	case ssa.OpPPC64LoweredMove:
+	case ssa.OpPPC64LoweredMove, ssa.OpPPC64LoweredMoveShort:

+		bytesPerLoop := int64(32)
 		// This will be used when moving more
 		// than 8 bytes.  Moves start with
 		// as many 8 byte moves as possible, then
@ -1019,34 +1215,34 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		//	MOVD REG_TMP,CTR
 		//	MOVD $16,REG_TMP
 		// top:
-		//	LXVD2X (R0)(R4),VS32
-		//	LXVD2X (R31)(R4),VS33
-		//	ADD $32,R4
-		//	STXVD2X VS32,(R0)(R3)
-		//	STXVD2X VS33,(R31)(R4)
-		//	ADD $32,R3
+		//	LXVD2X (R0)(R21),VS32
+		//	LXVD2X (R31)(R21),VS33
+		//	ADD $32,R21
+		//	STXVD2X VS32,(R0)(R20)
+		//	STXVD2X VS33,(R31)(R20)
+		//	ADD $32,R20
 		//	BC 16,0,top
 		// Bytes not moved by this loop are moved
 		// with a combination of the following instructions,
 		// starting with the largest sizes and generating as
 		// many as needed, using the appropriate offset value.
-		//	MOVD  n(R4),R14
-		//	MOVD  R14,n(R3)
-		//	MOVW  n1(R4),R14
-		//	MOVW  R14,n1(R3)
-		//	MOVH  n2(R4),R14
-		//	MOVH  R14,n2(R3)
-		//	MOVB  n3(R4),R14
-		//	MOVB  R14,n3(R3)
+		//	MOVD  n(R21),R31
+		//	MOVD  R31,n(R20)
+		//	MOVW  n1(R21),R31
+		//	MOVW  R31,n1(R20)
+		//	MOVH  n2(R21),R31
+		//	MOVH  R31,n2(R20)
+		//	MOVB  n3(R21),R31
+		//	MOVB  R31,n3(R20)

 		// Each loop iteration moves 32 bytes
-		ctr := v.AuxInt / 32
+		ctr := v.AuxInt / bytesPerLoop

 		// Remainder after the loop
-		rem := v.AuxInt % 32
+		rem := v.AuxInt % bytesPerLoop

-		dst_reg := v.Args[0].Reg()
-		src_reg := v.Args[1].Reg()
+		dstReg := v.Args[0].Reg()
+		srcReg := v.Args[1].Reg()

 		// The set of registers used here, must match the clobbered reg list
 		// in PPC64Ops.go.
@ -1076,57 +1272,65 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REGTMP

+			// Don't add padding for
+			// alignment with small iteration
+			// counts.
+			if ctr > 3 {
+				p = s.Prog(obj.APCALIGN)
+				p.From.Type = obj.TYPE_CONST
+				p.From.Offset = 16
+			}
+
 			// Generate 16 byte loads and stores.
 			// Use temp register for index (16)
 			// on the second one.
+
 			p = s.Prog(ppc64.ALXVD2X)
 			p.From.Type = obj.TYPE_MEM
-			p.From.Reg = src_reg
+			p.From.Reg = srcReg
 			p.From.Index = ppc64.REGZERO
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REG_VS32
-
 			if top == nil {
 				top = p
 			}
-
 			p = s.Prog(ppc64.ALXVD2X)
 			p.From.Type = obj.TYPE_MEM
-			p.From.Reg = src_reg
+			p.From.Reg = srcReg
 			p.From.Index = ppc64.REGTMP
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REG_VS33

 			// increment the src reg for next iteration
 			p = s.Prog(ppc64.AADD)
-			p.Reg = src_reg
+			p.Reg = srcReg
 			p.From.Type = obj.TYPE_CONST
-			p.From.Offset = 32
+			p.From.Offset = bytesPerLoop
 			p.To.Type = obj.TYPE_REG
-			p.To.Reg = src_reg
+			p.To.Reg = srcReg

 			// generate 16 byte stores
 			p = s.Prog(ppc64.ASTXVD2X)
 			p.From.Type = obj.TYPE_REG
 			p.From.Reg = ppc64.REG_VS32
 			p.To.Type = obj.TYPE_MEM
-			p.To.Reg = dst_reg
+			p.To.Reg = dstReg
 			p.To.Index = ppc64.REGZERO

 			p = s.Prog(ppc64.ASTXVD2X)
 			p.From.Type = obj.TYPE_REG
 			p.From.Reg = ppc64.REG_VS33
 			p.To.Type = obj.TYPE_MEM
-			p.To.Reg = dst_reg
+			p.To.Reg = dstReg
 			p.To.Index = ppc64.REGTMP

 			// increment the dst reg for next iteration
 			p = s.Prog(ppc64.AADD)
-			p.Reg = dst_reg
+			p.Reg = dstReg
 			p.From.Type = obj.TYPE_CONST
-			p.From.Offset = 32
+			p.From.Offset = bytesPerLoop
 			p.To.Type = obj.TYPE_REG
-			p.To.Reg = dst_reg
+			p.To.Reg = dstReg

 			// BC with BO_BCTR generates bdnz to branch on nonzero CTR
 			// to loop top.
@ -1137,7 +1341,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			p.To.Type = obj.TYPE_BRANCH
 			gc.Patch(p, top)

-			// src_reg and dst_reg were incremented in the loop, so
+			// srcReg and dstReg were incremented in the loop, so
 			// later instructions start with offset 0.
 			offset = int64(0)
 		}
@ -1145,7 +1349,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		// No loop was generated for one iteration, so
 		// add 32 bytes to the remainder to move those bytes.
 		if ctr == 1 {
-			rem += 32
+			rem += bytesPerLoop
 		}

 		if rem >= 16 {
@ -1154,7 +1358,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			// on the second one.
 			p := s.Prog(ppc64.ALXVD2X)
 			p.From.Type = obj.TYPE_MEM
-			p.From.Reg = src_reg
+			p.From.Reg = srcReg
 			p.From.Index = ppc64.REGZERO
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REG_VS32
@ -1163,7 +1367,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			p.From.Type = obj.TYPE_REG
 			p.From.Reg = ppc64.REG_VS32
 			p.To.Type = obj.TYPE_MEM
-			p.To.Reg = dst_reg
+			p.To.Reg = dstReg
 			p.To.Index = ppc64.REGZERO

 			offset = 16
@ -1171,18 +1375,15 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {

 			if rem >= 16 {
 				// Use REGTMP as index reg
-				p = s.Prog(ppc64.AMOVD)
+				p := s.Prog(ppc64.AMOVD)
 				p.From.Type = obj.TYPE_CONST
 				p.From.Offset = 16
 				p.To.Type = obj.TYPE_REG
 				p.To.Reg = ppc64.REGTMP

-				// Generate 16 byte loads and stores.
-				// Use temp register for index (16)
-				// on the second one.
 				p = s.Prog(ppc64.ALXVD2X)
 				p.From.Type = obj.TYPE_MEM
-				p.From.Reg = src_reg
+				p.From.Reg = srcReg
 				p.From.Index = ppc64.REGTMP
 				p.To.Type = obj.TYPE_REG
 				p.To.Reg = ppc64.REG_VS32
@ -1191,7 +1392,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 				p.From.Type = obj.TYPE_REG
 				p.From.Reg = ppc64.REG_VS32
 				p.To.Type = obj.TYPE_MEM
-				p.To.Reg = dst_reg
+				p.To.Reg = dstReg
 				p.To.Index = ppc64.REGTMP

 				offset = 32
@ -1214,17 +1415,284 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			// Load
 			p := s.Prog(op)
 			p.To.Type = obj.TYPE_REG
-			p.To.Reg = ppc64.REG_R14
+			p.To.Reg = ppc64.REGTMP
 			p.From.Type = obj.TYPE_MEM
-			p.From.Reg = src_reg
+			p.From.Reg = srcReg
 			p.From.Offset = offset

 			// Store
 			p = s.Prog(op)
 			p.From.Type = obj.TYPE_REG
-			p.From.Reg = ppc64.REG_R14
+			p.From.Reg = ppc64.REGTMP
 			p.To.Type = obj.TYPE_MEM
-			p.To.Reg = dst_reg
+			p.To.Reg = dstReg
+			p.To.Offset = offset
+			rem -= size
+			offset += size
+		}
+
+	case ssa.OpPPC64LoweredQuadMove, ssa.OpPPC64LoweredQuadMoveShort:
+		bytesPerLoop := int64(64)
+		// This is used when moving more than 8 bytes on power9
+		// or later.  Moves start with as many 16 byte LXV/STXV
+		// moves as possible, then 8, 4, 2, or 1 byte(s) as
+		// remaining.  If there are 128 or more bytes, then a loop
+		// is generated to move 64 bytes and update the src and dst
+		// addresses on each iteration. When < 128 bytes, the
+		// appropriate number of moves are generated based on the size.
+		// When moving >= 128 bytes a loop is used
+		//      MOVD len/64,REG_TMP
+		//      MOVD REG_TMP,CTR
+		// top:
+		//      LXV 0(R21),VS32
+		//      LXV 16(R21),VS33
+		//      STXV VS32,0(R20)
+		//      STXV VS33,16(R20)
+		//      LXV 32(R21),VS32
+		//      LXV 48(R21),VS33
+		//      STXV VS32,32(R20)
+		//      STXV VS33,48(R20)
+		//      ADD $64,R21
+		//      ADD $64,R20
+		//      BC 16,0,top
+		// Bytes not moved by this loop are moved
+		// with a combination of the following instructions,
+		// starting with the largest sizes and generating as
+		// many as needed, using the appropriate offset value.
+		//      MOVD  n(R21),R31
+		//      MOVD  R31,n(R20)
+		//      MOVW  n1(R21),R31
+		//      MOVW  R31,n1(R20)
+		//      MOVH  n2(R21),R31
+		//      MOVH  R31,n2(R20)
+		//      MOVB  n3(R21),R31
+		//      MOVB  R31,n3(R20)
+
+		// Each loop iteration moves 64 bytes
+		ctr := v.AuxInt / bytesPerLoop
+
+		// Remainder after the loop
+		rem := v.AuxInt % bytesPerLoop
+
+		dstReg := v.Args[0].Reg()
+		srcReg := v.Args[1].Reg()
+
+		offset := int64(0)
+
+		// top of the loop
+		var top *obj.Prog
+
+		// Only generate looping code when loop counter is > 1 for >= 128 bytes
+		if ctr > 1 {
+			// Set up the CTR
+			p := s.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = ctr
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REGTMP
+
+			p = s.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REGTMP
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_CTR
+
+			p = s.Prog(obj.APCALIGN)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = 16
+
+			// Generate 16 byte loads and stores.
+			p = s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = offset
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+			if top == nil {
+				top = p
+			}
+			p = s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = offset + 16
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS33
+
+			// generate 16 byte stores
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+			p.To.Offset = offset
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS33
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+			p.To.Offset = offset + 16
+
+			// Generate 16 byte loads and stores.
+			p = s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = offset + 32
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+
+			p = s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = offset + 48
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS33
+
+			// generate 16 byte stores
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+			p.To.Offset = offset + 32
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS33
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+			p.To.Offset = offset + 48
+
+			// increment the src reg for next iteration
+			p = s.Prog(ppc64.AADD)
+			p.Reg = srcReg
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = bytesPerLoop
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = srcReg
+
+			// increment the dst reg for next iteration
+			p = s.Prog(ppc64.AADD)
+			p.Reg = dstReg
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = bytesPerLoop
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = dstReg
+
+			// BC with BO_BCTR generates bdnz to branch on nonzero CTR
+			// to loop top.
+			p = s.Prog(ppc64.ABC)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = ppc64.BO_BCTR
+			p.Reg = ppc64.REG_R0
+			p.To.Type = obj.TYPE_BRANCH
+			gc.Patch(p, top)
+
+			// srcReg and dstReg were incremented in the loop, so
+			// later instructions start with offset 0.
+			offset = int64(0)
+		}
+
+		// No loop was generated for one iteration, so
+		// add 64 bytes to the remainder to move those bytes.
+		if ctr == 1 {
+			rem += bytesPerLoop
+		}
+		if rem >= 32 {
+			p := s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+
+			p = s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = 16
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS33
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS33
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+			p.To.Offset = 16
+
+			offset = 32
+			rem -= 32
+		}
+
+		if rem >= 16 {
+			// Generate 16 byte loads and stores.
+			p := s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = offset
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+			p.To.Offset = offset
+
+			offset += 16
+			rem -= 16
+
+			if rem >= 16 {
+				p := s.Prog(ppc64.ALXV)
+				p.From.Type = obj.TYPE_MEM
+				p.From.Reg = srcReg
+				p.From.Offset = offset
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = ppc64.REG_VS32
+
+				p = s.Prog(ppc64.ASTXV)
+				p.From.Type = obj.TYPE_REG
+				p.From.Reg = ppc64.REG_VS32
+				p.To.Type = obj.TYPE_MEM
+				p.To.Reg = dstReg
+				p.To.Offset = offset
+
+				offset += 16
+				rem -= 16
+			}
+		}
+		// Generate all the remaining load and store pairs, starting with
+		// as many 8 byte moves as possible, then 4, 2, 1.
+		for rem > 0 {
+			op, size := ppc64.AMOVB, int64(1)
+			switch {
+			case rem >= 8:
+				op, size = ppc64.AMOVD, 8
+			case rem >= 4:
+				op, size = ppc64.AMOVW, 4
+			case rem >= 2:
+				op, size = ppc64.AMOVH, 2
+			}
+			// Load
+			p := s.Prog(op)
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REGTMP
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = offset
+
+			// Store
+			p = s.Prog(op)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REGTMP
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
 			p.To.Offset = offset
 			rem -= size
 			offset += size
--- a/src/cmd/compile/internal/ssa/block.go
+++ b/src/cmd/compile/internal/ssa/block.go
@ -124,15 +124,8 @@ func (b *Block) LongString() string {
 	if b.Aux != nil {
 		s += fmt.Sprintf(" {%s}", b.Aux)
 	}
-	if t := b.Kind.AuxIntType(); t != "" {
-		switch t {
-		case "Int8":
-			s += fmt.Sprintf(" [%v]", int8(b.AuxInt))
-		case "UInt8":
-			s += fmt.Sprintf(" [%v]", uint8(b.AuxInt))
-		default:
-			s += fmt.Sprintf(" [%v]", b.AuxInt)
-		}
+	if t := b.AuxIntString(); t != "" {
+		s += fmt.Sprintf(" [%s]", t)
 	}
 	for _, c := range b.ControlValues() {
 		s += fmt.Sprintf(" %s", c)
@ -341,6 +334,19 @@ func (b *Block) LackingPos() bool {
 	return true
 }

+func (b *Block) AuxIntString() string {
+	switch b.Kind.AuxIntType() {
+	case "Int8":
+		return fmt.Sprintf("%v", int8(b.AuxInt))
+	case "UInt8":
+		return fmt.Sprintf("%v", uint8(b.AuxInt))
+	default: // type specified but not implemented - print as int64
+		return fmt.Sprintf("%v", b.AuxInt)
+	case "": // no aux int type
+		return ""
+	}
+}
+
 func (b *Block) Logf(msg string, args ...interface{})   { b.Func.Logf(msg, args...) }
 func (b *Block) Log() bool                              { return b.Func.Log() }
 func (b *Block) Fatalf(msg string, args ...interface{}) { b.Func.Fatalf(msg, args...) }
--- a/src/cmd/compile/internal/ssa/compile.go
+++ b/src/cmd/compile/internal/ssa/compile.go
@ -55,7 +55,7 @@ func Compile(f *Func) {
 	if f.Log() {
 		printFunc(f)
 	}
-	f.HTMLWriter.WriteFunc("start", "start", f)
+	f.HTMLWriter.WritePhase("start", "start")
 	if BuildDump != "" && BuildDump == f.Name {
 		f.dumpFile("build")
 	}
@ -111,7 +111,7 @@ func Compile(f *Func) {
 				f.Logf("  pass %s end %s\n", p.name, stats)
 				printFunc(f)
 			}
-			f.HTMLWriter.WriteFunc(phaseName, fmt.Sprintf("%s <span class=\"stats\">%s</span>", phaseName, stats), f)
+			f.HTMLWriter.WritePhase(phaseName, fmt.Sprintf("%s <span class=\"stats\">%s</span>", phaseName, stats))
 		}
 		if p.time || p.mem {
 			// Surround timing information w/ enough context to allow comparisons.
@ -136,6 +136,11 @@ func Compile(f *Func) {
 		}
 	}

+	if f.HTMLWriter != nil {
+		// Ensure we write any pending phases to the html
+		f.HTMLWriter.flushPhases()
+	}
+
 	if f.ruleMatches != nil {
 		var keys []string
 		for key := range f.ruleMatches {
--- a/src/cmd/compile/internal/ssa/gen/386.rules
+++ b/src/cmd/compile/internal/ssa/gen/386.rules
@ -249,7 +249,7 @@
 // Medium copying uses a duff device.
 (Move [s] dst src mem)
 	&& s > 8 && s <= 4*128 && s%4 == 0
-	&& !config.noDuffDevice ->
+	&& !config.noDuffDevice && logLargeCopy(v, s) ->
 	(DUFFCOPY [10*(128-s/4)] dst src mem)
 // 10 and 128 are magic constants.  10 is the number of bytes to encode:
 //	MOVL	(SI), CX
@ -259,7 +259,7 @@
 // and 128 is the number of such blocks. See src/runtime/duff_386.s:duffcopy.

 // Large copying uses REP MOVSL.
-(Move [s] dst src mem) && (s > 4*128 || config.noDuffDevice) && s%4 == 0 ->
+(Move [s] dst src mem) && (s > 4*128 || config.noDuffDevice) && s%4 == 0 && logLargeCopy(v, s) ->
 	(REPMOVSL dst src (MOVLconst [s/4]) mem)

 // Lowering Zero instructions
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@ -317,7 +317,7 @@
 // Medium copying uses a duff device.
 (Move [s] dst src mem)
 	&& s > 64 && s <= 16*64 && s%16 == 0
-	&& !config.noDuffDevice ->
+	&& !config.noDuffDevice && logLargeCopy(v, s) ->
 	(DUFFCOPY [14*(64-s/16)] dst src mem)
 // 14 and 64 are magic constants.  14 is the number of bytes to encode:
 //	MOVUPS	(SI), X0
@ -327,7 +327,7 @@
 // and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.

 // Large copying uses REP MOVSQ.
-(Move [s] dst src mem) && (s > 16*64 || config.noDuffDevice) && s%8 == 0 ->
+(Move [s] dst src mem) && (s > 16*64 || config.noDuffDevice) && s%8 == 0 && logLargeCopy(v, s) ->
 	(REPMOVSQ dst src (MOVQconst [s/8]) mem)

 // Lowering Zero instructions
@ -478,6 +478,7 @@
 (GetClosurePtr ...) -> (LoweredGetClosurePtr ...)
 (GetCallerPC ...) -> (LoweredGetCallerPC ...)
 (GetCallerSP ...) -> (LoweredGetCallerSP ...)
+(HasCPUFeature ...) -> (LoweredHasCPUFeature ...)
 (Addr ...) -> (LEAQ ...)
 (LocalAddr {sym} base _) -> (LEAQ {sym} base)

--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@ -738,6 +738,8 @@ func init() {
 		// It saves all GP registers if necessary, but may clobber others.
 		{name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), buildReg("AX CX DX BX BP SI R8 R9")}, clobbers: callerSave &^ gp}, clobberFlags: true, aux: "Sym", symEffect: "None"},

+		{name: "LoweredHasCPUFeature", argLength: 0, reg: gp01, rematerializeable: true, typ: "bool", aux: "Sym", symEffect: "None"},
+
 		// There are three of these functions so that they can have three different register inputs.
 		// When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
 		// default registers to match so we don't need to copy registers around unnecessarily.
--- a/src/cmd/compile/internal/ssa/gen/ARM.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM.rules
@ -338,12 +338,12 @@
 // 8 and 128 are magic constants, see runtime/mkduff.go
 (Move [s] {t} dst src mem)
 	&& s%4 == 0 && s > 4 && s <= 512
-	&& t.(*types.Type).Alignment()%4 == 0 && !config.noDuffDevice ->
+	&& t.(*types.Type).Alignment()%4 == 0 && !config.noDuffDevice && logLargeCopy(v, s) ->
 	(DUFFCOPY [8 * (128 - s/4)] dst src mem)

 // Large move uses a loop
 (Move [s] {t} dst src mem)
-	&& (s > 512 || config.noDuffDevice) || t.(*types.Type).Alignment()%4 != 0 ->
+	&& ((s > 512 || config.noDuffDevice) || t.(*types.Type).Alignment()%4 != 0) && logLargeCopy(v, s) ->
 	(LoweredMove [t.(*types.Type).Alignment()]
 		dst
 		src
--- a/src/cmd/compile/internal/ssa/gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules
@ -470,12 +470,12 @@
 // medium move uses a duff device
 (Move [s] dst src mem)
 	&& s > 32 && s <= 16*64 && s%16 == 8
-	&& !config.noDuffDevice ->
+	&& !config.noDuffDevice && logLargeCopy(v, s) ->
 	(MOVDstore [s-8] dst (MOVDload [s-8] src mem)
 		(DUFFCOPY <types.TypeMem> [8*(64-(s-8)/16)] dst src mem))
 (Move [s] dst src mem)
 	&& s > 32 && s <= 16*64 && s%16 == 0
-	&& !config.noDuffDevice ->
+	&& !config.noDuffDevice && logLargeCopy(v, s) ->
 	(DUFFCOPY [8 * (64 - s/16)] dst src mem)
 // 8 is the number of bytes to encode:
 //
@ -486,7 +486,7 @@

 // large move uses a loop
 (Move [s] dst src mem)
-	&& s > 24 && s%8 == 0 ->
+	&& s > 24 && s%8 == 0 && logLargeCopy(v, s) ->
 	(LoweredMove
 		dst
 		src
--- a/src/cmd/compile/internal/ssa/gen/MIPS.rules
+++ b/src/cmd/compile/internal/ssa/gen/MIPS.rules
@ -325,7 +325,7 @@

 // large or unaligned move uses a loop
 (Move [s] {t} dst src mem)
-	&& (s > 16 || t.(*types.Type).Alignment()%4 != 0) ->
+	&& (s > 16 && logLargeCopy(v, s) || t.(*types.Type).Alignment()%4 != 0) ->
 	(LoweredMove [t.(*types.Type).Alignment()]
 		dst
 		src
--- a/src/cmd/compile/internal/ssa/gen/MIPS64.rules
+++ b/src/cmd/compile/internal/ssa/gen/MIPS64.rules
@ -359,7 +359,7 @@
 // medium move uses a duff device
 (Move [s] {t} dst src mem)
 	&& s%8 == 0 && s >= 24 && s <= 8*128 && t.(*types.Type).Alignment()%8 == 0
-	&& !config.noDuffDevice ->
+	&& !config.noDuffDevice && logLargeCopy(v, s)  ->
 	(DUFFCOPY [16 * (128 - s/8)] dst src mem)
 // 16 and 128 are magic constants.  16 is the number of bytes to encode:
 //	MOVV	(R1), R23
@ -370,7 +370,7 @@

 // large or unaligned move uses a loop
 (Move [s] {t} dst src mem)
-	&& s > 24 || t.(*types.Type).Alignment()%8 != 0 ->
+	&& s > 24 && logLargeCopy(v, s) || t.(*types.Type).Alignment()%8 != 0 ->
 	(LoweredMove [t.(*types.Type).Alignment()]
 		dst
 		src
--- a/src/cmd/compile/internal/ssa/gen/PPC64.rules
+++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules
@ -574,7 +574,12 @@
                               (MOVDstorezero [0] destptr mem))))

 // Handle cases not handled above
-(Zero [s] ptr mem) -> (LoweredZero [s] ptr mem)
+// Lowered Short cases do not generate loops, and as a result don't clobber
+// the address registers or flags.
+(Zero [s] ptr mem) && objabi.GOPPC64 <= 8 && s < 64 -> (LoweredZeroShort [s] ptr mem)
+(Zero [s] ptr mem) && objabi.GOPPC64 <= 8 -> (LoweredZero [s] ptr mem)
+(Zero [s] ptr mem) && s < 128 && objabi.GOPPC64 >= 9 -> (LoweredQuadZeroShort [s] ptr mem)
+(Zero [s] ptr mem) && objabi.GOPPC64 >= 9 -> (LoweredQuadZero [s] ptr mem)

 // moves
 // Only the MOVD and MOVW instructions require 4 byte
@ -608,8 +613,12 @@

 // Large move uses a loop. Since the address is computed and the
 // offset is zero, any alignment can be used.
-(Move [s] dst src mem) && s > 8 ->
+(Move [s] dst src mem) && s > 8 && objabi.GOPPC64 <= 8 && logLargeCopy(v, s) ->
        (LoweredMove [s] dst src mem)
+(Move [s] dst src mem) && s > 8 && s <= 64 && objabi.GOPPC64 >= 9 ->
+        (LoweredQuadMoveShort [s] dst src mem)
+(Move [s] dst src mem) && s > 8 && objabi.GOPPC64 >= 9 && logLargeCopy(v, s) ->
+        (LoweredQuadMove [s] dst src mem)

 // Calls
 // Lowering calls
--- a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
@ -445,14 +445,49 @@ func init() {
 			aux:       "Int64",
 			argLength: 2,
 			reg: regInfo{
-				inputs:   []regMask{buildReg("R3")},
-				clobbers: buildReg("R3"),
+				inputs:   []regMask{buildReg("R20")},
+				clobbers: buildReg("R20"),
 			},
 			clobberFlags:   true,
 			typ:            "Mem",
 			faultOnNilArg0: true,
 			unsafePoint:    true,
 		},
+		{
+			name:      "LoweredZeroShort",
+			aux:       "Int64",
+			argLength: 2,
+			reg: regInfo{
+				inputs: []regMask{gp}},
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			unsafePoint:    true,
+		},
+		{
+			name:      "LoweredQuadZeroShort",
+			aux:       "Int64",
+			argLength: 2,
+			reg: regInfo{
+				inputs: []regMask{gp},
+			},
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			unsafePoint:    true,
+		},
+		{
+			name:      "LoweredQuadZero",
+			aux:       "Int64",
+			argLength: 2,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("R20")},
+				clobbers: buildReg("R20"),
+			},
+			clobberFlags:   true,
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			unsafePoint:    true,
+		},
+
 		// R31 is temp register
 		// Loop code:
 		//	MOVD len/32,R31		set up loop ctr
@ -491,8 +526,8 @@ func init() {
 			aux:       "Int64",
 			argLength: 3,
 			reg: regInfo{
-				inputs:   []regMask{buildReg("R3"), buildReg("R4")},
-				clobbers: buildReg("R3 R4 R14"),
+				inputs:   []regMask{buildReg("R20"), buildReg("R21")},
+				clobbers: buildReg("R20 R21"),
 			},
 			clobberFlags:   true,
 			typ:            "Mem",
@ -500,6 +535,49 @@ func init() {
 			faultOnNilArg1: true,
 			unsafePoint:    true,
 		},
+		{
+			name:      "LoweredMoveShort",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs: []regMask{gp, gp},
+			},
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			faultOnNilArg1: true,
+			unsafePoint:    true,
+		},
+
+		// The following is similar to the LoweredMove, but uses
+		// LXV instead of LXVD2X, which does not require an index
+		// register and will do 4 in a loop instead of only 2.
+		{
+			name:      "LoweredQuadMove",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("R20"), buildReg("R21")},
+				clobbers: buildReg("R20 R21"),
+			},
+			clobberFlags:   true,
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			faultOnNilArg1: true,
+			unsafePoint:    true,
+		},
+
+		{
+			name:      "LoweredQuadMoveShort",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs: []regMask{gp, gp},
+			},
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			faultOnNilArg1: true,
+			unsafePoint:    true,
+		},

 		{name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true},
 		{name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true},
--- a/src/cmd/compile/internal/ssa/gen/S390X.rules
+++ b/src/cmd/compile/internal/ssa/gen/S390X.rules
@ -386,17 +386,17 @@
 			(MOVWstore dst (MOVWZload src mem) mem)))

 // MVC for other moves. Use up to 4 instructions (sizes up to 1024 bytes).
-(Move [s] dst src mem) && s > 0 && s <= 256 ->
+(Move [s] dst src mem) && s > 0 && s <= 256 && logLargeCopy(v, s) ->
 	(MVC [makeValAndOff(s, 0)] dst src mem)
-(Move [s] dst src mem) && s > 256 && s <= 512 ->
+(Move [s] dst src mem) && s > 256 && s <= 512 && logLargeCopy(v, s) ->
 	(MVC [makeValAndOff(s-256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem))
-(Move [s] dst src mem) && s > 512 && s <= 768 ->
+(Move [s] dst src mem) && s > 512 && s <= 768 && logLargeCopy(v, s) ->
 	(MVC [makeValAndOff(s-512, 512)] dst src (MVC [makeValAndOff(256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem)))
-(Move [s] dst src mem) && s > 768 && s <= 1024 ->
+(Move [s] dst src mem) && s > 768 && s <= 1024 && logLargeCopy(v, s) ->
 	(MVC [makeValAndOff(s-768, 768)] dst src (MVC [makeValAndOff(256, 512)] dst src (MVC [makeValAndOff(256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem))))

 // Move more than 1024 bytes using a loop.
-(Move [s] dst src mem) && s > 1024 ->
+(Move [s] dst src mem) && s > 1024 && logLargeCopy(v, s) ->
 	(LoweredMove [s%256] dst src (ADD <src.Type> src (MOVDconst [(s/256)*256])) mem)

 // Lowering Zero instructions
@ -421,7 +421,7 @@
 (Zero [s] destptr mem) && s > 0 && s <= 1024 ->
 	(CLEAR [makeValAndOff(s, 0)] destptr mem)

-// Move more than 1024 bytes using a loop.
+// Zero more than 1024 bytes using a loop.
 (Zero [s] destptr mem) && s > 1024 ->
 	(LoweredZero [s%256] destptr (ADDconst <destptr.Type> destptr [(s/256)*256]) mem)

--- a/src/cmd/compile/internal/ssa/gen/Wasm.rules
+++ b/src/cmd/compile/internal/ssa/gen/Wasm.rules
@ -253,7 +253,7 @@
 			(I64Store dst (I64Load src mem) mem)))

 // Large copying uses helper.
-(Move [s] dst src mem) && s%8 == 0 ->
+(Move [s] dst src mem) && s%8 == 0 && logLargeCopy(v, s) ->
 	(LoweredMove [s/8] dst src mem)

 // Lowering Zero instructions
--- a/src/cmd/compile/internal/ssa/gen/genericOps.go
+++ b/src/cmd/compile/internal/ssa/gen/genericOps.go
@ -378,6 +378,8 @@ var genericOps = []opData{
 	// arch-dependent), and is not a safe-point.
 	{name: "WB", argLength: 3, typ: "Mem", aux: "Sym", symEffect: "None"}, // arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier

+	{name: "HasCPUFeature", argLength: 0, typ: "bool", aux: "Sym", symEffect: "None"}, // aux=place that this feature flag can be loaded from
+
 	// PanicBounds and PanicExtend generate a runtime panic.
 	// Their arguments provide index values to use in panic messages.
 	// Both PanicBounds and PanicExtend have an AuxInt value from the BoundsKind type (in ../op.go).
--- a/src/cmd/compile/internal/ssa/html.go
+++ b/src/cmd/compile/internal/ssa/html.go
@ -18,8 +18,8 @@ import (
 )

 type HTMLWriter struct {
-	Logger
 	w             io.WriteCloser
+	Func          *Func
 	path          string
 	dot           *dotWriter
 	prevHash      []byte
@ -27,22 +27,37 @@ type HTMLWriter struct {
 	pendingTitles []string
 }

-func NewHTMLWriter(path string, logger Logger, funcname, cfgMask string) *HTMLWriter {
+func NewHTMLWriter(path string, f *Func, cfgMask string) *HTMLWriter {
 	out, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
 	if err != nil {
-		logger.Fatalf(src.NoXPos, "%v", err)
+		f.Fatalf("%v", err)
 	}
 	pwd, err := os.Getwd()
 	if err != nil {
-		logger.Fatalf(src.NoXPos, "%v", err)
+		f.Fatalf("%v", err)
 	}
-	html := HTMLWriter{w: out, Logger: logger, path: filepath.Join(pwd, path)}
-	html.dot = newDotWriter(cfgMask)
-	html.start(funcname)
+	html := HTMLWriter{
+		w:    out,
+		Func: f,
+		path: filepath.Join(pwd, path),
+		dot:  newDotWriter(cfgMask),
+	}
+	html.start()
 	return &html
 }

-func (w *HTMLWriter) start(name string) {
+// Fatalf reports an error and exits.
+func (w *HTMLWriter) Fatalf(msg string, args ...interface{}) {
+	fe := w.Func.Frontend()
+	fe.Fatalf(src.NoXPos, msg, args...)
+}
+
+// Logf calls the (w *HTMLWriter).Func's Logf method passing along a msg and args.
+func (w *HTMLWriter) Logf(msg string, args ...interface{}) {
+	w.Func.Logf(msg, args...)
+}
+
+func (w *HTMLWriter) start() {
 	if w == nil {
 		return
 	}
@ -703,7 +718,7 @@ function toggleDarkMode() {
 </head>`)
 	w.WriteString("<body>")
 	w.WriteString("<h1>")
-	w.WriteString(html.EscapeString(name))
+	w.WriteString(html.EscapeString(w.Func.Name))
 	w.WriteString("</h1>")
 	w.WriteString(`
 <a href="#" onclick="toggle_visibility('help');return false;" id="helplink">help</a>
@ -749,24 +764,38 @@ func (w *HTMLWriter) Close() {
 	fmt.Printf("dumped SSA to %v\n", w.path)
 }

-// WriteFunc writes f in a column headed by title.
+// WritePhase writes w.Func in a column headed by title.
 // phase is used for collapsing columns and should be unique across the table.
-func (w *HTMLWriter) WriteFunc(phase, title string, f *Func) {
+func (w *HTMLWriter) WritePhase(phase, title string) {
 	if w == nil {
 		return // avoid generating HTML just to discard it
 	}
-	hash := hashFunc(f)
+	hash := hashFunc(w.Func)
 	w.pendingPhases = append(w.pendingPhases, phase)
 	w.pendingTitles = append(w.pendingTitles, title)
 	if !bytes.Equal(hash, w.prevHash) {
-		phases := strings.Join(w.pendingPhases, "  +  ")
-		w.WriteMultiTitleColumn(phases, w.pendingTitles, fmt.Sprintf("hash-%x", hash), f.HTML(phase, w.dot))
-		w.pendingPhases = w.pendingPhases[:0]
-		w.pendingTitles = w.pendingTitles[:0]
+		w.flushPhases()
 	}
 	w.prevHash = hash
 }

+// flushPhases collects any pending phases and titles, writes them to the html, and resets the pending slices.
+func (w *HTMLWriter) flushPhases() {
+	phaseLen := len(w.pendingPhases)
+	if phaseLen == 0 {
+		return
+	}
+	phases := strings.Join(w.pendingPhases, "  +  ")
+	w.WriteMultiTitleColumn(
+		phases,
+		w.pendingTitles,
+		fmt.Sprintf("hash-%x", w.prevHash),
+		w.Func.HTML(w.pendingPhases[phaseLen-1], w.dot),
+	)
+	w.pendingPhases = w.pendingPhases[:0]
+	w.pendingTitles = w.pendingTitles[:0]
+}
+
 // FuncLines contains source code for a function to be displayed
 // in sources column.
 type FuncLines struct {
@ -903,13 +932,13 @@ func (w *HTMLWriter) WriteMultiTitleColumn(phase string, titles []string, class,

 func (w *HTMLWriter) Printf(msg string, v ...interface{}) {
 	if _, err := fmt.Fprintf(w.w, msg, v...); err != nil {
-		w.Fatalf(src.NoXPos, "%v", err)
+		w.Fatalf("%v", err)
 	}
 }

 func (w *HTMLWriter) WriteString(s string) {
 	if _, err := io.WriteString(w.w, s); err != nil {
-		w.Fatalf(src.NoXPos, "%v", err)
+		w.Fatalf("%v", err)
 	}
 }

@ -976,6 +1005,9 @@ func (b *Block) LongHTML() string {
 	if b.Aux != nil {
 		s += html.EscapeString(fmt.Sprintf(" {%v}", b.Aux))
 	}
+	if t := b.AuxIntString(); t != "" {
+		s += html.EscapeString(fmt.Sprintf(" [%v]", t))
+	}
 	for _, c := range b.ControlValues() {
 		s += fmt.Sprintf(" %s", c.HTML())
 	}
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@ -885,6 +885,7 @@ const (
 	OpAMD64LoweredGetCallerSP
 	OpAMD64LoweredNilCheck
 	OpAMD64LoweredWB
+	OpAMD64LoweredHasCPUFeature
 	OpAMD64LoweredPanicBoundsA
 	OpAMD64LoweredPanicBoundsB
 	OpAMD64LoweredPanicBoundsC
@ -1871,7 +1872,13 @@ const (
 	OpPPC64CALLclosure
 	OpPPC64CALLinter
 	OpPPC64LoweredZero
+	OpPPC64LoweredZeroShort
+	OpPPC64LoweredQuadZeroShort
+	OpPPC64LoweredQuadZero
 	OpPPC64LoweredMove
+	OpPPC64LoweredMoveShort
+	OpPPC64LoweredQuadMove
+	OpPPC64LoweredQuadMoveShort
 	OpPPC64LoweredAtomicStore8
 	OpPPC64LoweredAtomicStore32
 	OpPPC64LoweredAtomicStore64
@ -2596,6 +2603,7 @@ const (
 	OpMoveWB
 	OpZeroWB
 	OpWB
+	OpHasCPUFeature
 	OpPanicBounds
 	OpPanicExtend
 	OpClosureCall
@ -11650,6 +11658,18 @@ var opcodeTable = [...]opInfo{
 			clobbers: 4294901760, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
 		},
 	},
+	{
+		name:              "LoweredHasCPUFeature",
+		auxType:           auxSym,
+		argLen:            0,
+		rematerializeable: true,
+		symEffect:         SymNone,
+		reg: regInfo{
+			outputs: []outputInfo{
+				{0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
 	{
 		name:    "LoweredPanicBoundsA",
 		auxType: auxInt64,
@ -24851,9 +24871,47 @@ var opcodeTable = [...]opInfo{
 		unsafePoint:    true,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 8}, // R3
+				{0, 1048576}, // R20
 			},
-			clobbers: 8, // R3
+			clobbers: 1048576, // R20
+		},
+	},
+	{
+		name:           "LoweredZeroShort",
+		auxType:        auxInt64,
+		argLen:         2,
+		faultOnNilArg0: true,
+		unsafePoint:    true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
+		name:           "LoweredQuadZeroShort",
+		auxType:        auxInt64,
+		argLen:         2,
+		faultOnNilArg0: true,
+		unsafePoint:    true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
+		name:           "LoweredQuadZero",
+		auxType:        auxInt64,
+		argLen:         2,
+		clobberFlags:   true,
+		faultOnNilArg0: true,
+		unsafePoint:    true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1048576}, // R20
+			},
+			clobbers: 1048576, // R20
 		},
 	},
 	{
@ -24866,10 +24924,54 @@ var opcodeTable = [...]opInfo{
 		unsafePoint:    true,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 8},  // R3
-				{1, 16}, // R4
+				{0, 1048576}, // R20
+				{1, 2097152}, // R21
+			},
+			clobbers: 3145728, // R20 R21
+		},
+	},
+	{
+		name:           "LoweredMoveShort",
+		auxType:        auxInt64,
+		argLen:         3,
+		faultOnNilArg0: true,
+		faultOnNilArg1: true,
+		unsafePoint:    true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+				{1, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
+		name:           "LoweredQuadMove",
+		auxType:        auxInt64,
+		argLen:         3,
+		clobberFlags:   true,
+		faultOnNilArg0: true,
+		faultOnNilArg1: true,
+		unsafePoint:    true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1048576}, // R20
+				{1, 2097152}, // R21
+			},
+			clobbers: 3145728, // R20 R21
+		},
+	},
+	{
+		name:           "LoweredQuadMoveShort",
+		auxType:        auxInt64,
+		argLen:         3,
+		faultOnNilArg0: true,
+		faultOnNilArg1: true,
+		unsafePoint:    true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+				{1, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
 			},
-			clobbers: 16408, // R3 R4 R14
 		},
 	},
 	{
@ -32979,6 +33081,13 @@ var opcodeTable = [...]opInfo{
 		symEffect: SymNone,
 		generic:   true,
 	},
+	{
+		name:      "HasCPUFeature",
+		auxType:   auxSym,
+		argLen:    0,
+		symEffect: SymNone,
+		generic:   true,
+	},
 	{
 		name:    "PanicBounds",
 		auxType: auxInt64,
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@ -5,6 +5,7 @@
 package ssa

 import (
+	"cmd/compile/internal/logopt"
 	"cmd/compile/internal/types"
 	"cmd/internal/obj"
 	"cmd/internal/objabi"
@ -1074,9 +1075,9 @@ func isInlinableMemmove(dst, src *Value, sz int64, c *Config) bool {
 	switch c.arch {
 	case "amd64":
 		return sz <= 16 || (sz < 1024 && disjoint(dst, sz, src, sz))
-	case "386", "ppc64", "ppc64le", "arm64":
+	case "386", "arm64":
 		return sz <= 8
-	case "s390x":
+	case "s390x", "ppc64", "ppc64le":
 		return sz <= 8 || disjoint(dst, sz, src, sz)
 	case "arm", "mips", "mips64", "mipsle", "mips64le":
 		return sz <= 4
@ -1084,6 +1085,19 @@ func isInlinableMemmove(dst, src *Value, sz int64, c *Config) bool {
 	return false
 }

+// logLargeCopy logs the occurrence of a large copy.
+// The best place to do this is in the rewrite rules where the size of the move is easy to find.
+// "Large" is arbitrarily chosen to be 128 bytes; this may change.
+func logLargeCopy(v *Value, s int64) bool {
+	if s < 128 {
+		return true
+	}
+	if logopt.Enabled() {
+		logopt.LogOpt(v.Pos, "copy", "lower", v.Block.Func.Name, fmt.Sprintf("%d bytes", s))
+	}
+	return true
+}
+
 // hasSmallRotate reports whether the architecture has rotate instructions
 // for sizes < 32-bit.  This is used to decide whether to promote some rotations.
 func hasSmallRotate(c *Config) bool {
--- a/src/cmd/compile/internal/ssa/rewrite386.go
+++ b/src/cmd/compile/internal/ssa/rewrite386.go
@ -10046,14 +10046,14 @@ func rewriteValue386_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > 8 && s <= 4*128 && s%4 == 0 && !config.noDuffDevice
+	// cond: s > 8 && s <= 4*128 && s%4 == 0 && !config.noDuffDevice && logLargeCopy(v, s)
 	// result: (DUFFCOPY [10*(128-s/4)] dst src mem)
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 8 && s <= 4*128 && s%4 == 0 && !config.noDuffDevice) {
+		if !(s > 8 && s <= 4*128 && s%4 == 0 && !config.noDuffDevice && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(Op386DUFFCOPY)
@ -10062,14 +10062,14 @@ func rewriteValue386_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: (s > 4*128 || config.noDuffDevice) && s%4 == 0
+	// cond: (s > 4*128 || config.noDuffDevice) && s%4 == 0 && logLargeCopy(v, s)
 	// result: (REPMOVSL dst src (MOVLconst [s/4]) mem)
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !((s > 4*128 || config.noDuffDevice) && s%4 == 0) {
+		if !((s > 4*128 || config.noDuffDevice) && s%4 == 0 && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(Op386REPMOVSL)
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@ -786,6 +786,9 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpGreater32F(v)
 	case OpGreater64F:
 		return rewriteValueAMD64_OpGreater64F(v)
+	case OpHasCPUFeature:
+		v.Op = OpAMD64LoweredHasCPUFeature
+		return true
 	case OpHmul32:
 		v.Op = OpAMD64HMULL
 		return true
@ -31632,14 +31635,14 @@ func rewriteValueAMD64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > 64 && s <= 16*64 && s%16 == 0 && !config.noDuffDevice
+	// cond: s > 64 && s <= 16*64 && s%16 == 0 && !config.noDuffDevice && logLargeCopy(v, s)
 	// result: (DUFFCOPY [14*(64-s/16)] dst src mem)
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 64 && s <= 16*64 && s%16 == 0 && !config.noDuffDevice) {
+		if !(s > 64 && s <= 16*64 && s%16 == 0 && !config.noDuffDevice && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpAMD64DUFFCOPY)
@ -31648,14 +31651,14 @@ func rewriteValueAMD64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: (s > 16*64 || config.noDuffDevice) && s%8 == 0
+	// cond: (s > 16*64 || config.noDuffDevice) && s%8 == 0 && logLargeCopy(v, s)
 	// result: (REPMOVSQ dst src (MOVQconst [s/8]) mem)
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !((s > 16*64 || config.noDuffDevice) && s%8 == 0) {
+		if !((s > 16*64 || config.noDuffDevice) && s%8 == 0 && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpAMD64REPMOVSQ)
--- a/src/cmd/compile/internal/ssa/rewriteARM.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM.go
@ -15228,7 +15228,7 @@ func rewriteValueARM_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] {t} dst src mem)
-	// cond: s%4 == 0 && s > 4 && s <= 512 && t.(*types.Type).Alignment()%4 == 0 && !config.noDuffDevice
+	// cond: s%4 == 0 && s > 4 && s <= 512 && t.(*types.Type).Alignment()%4 == 0 && !config.noDuffDevice && logLargeCopy(v, s)
 	// result: (DUFFCOPY [8 * (128 - s/4)] dst src mem)
 	for {
 		s := v.AuxInt
@ -15236,7 +15236,7 @@ func rewriteValueARM_OpMove(v *Value) bool {
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s%4 == 0 && s > 4 && s <= 512 && t.(*types.Type).Alignment()%4 == 0 && !config.noDuffDevice) {
+		if !(s%4 == 0 && s > 4 && s <= 512 && t.(*types.Type).Alignment()%4 == 0 && !config.noDuffDevice && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpARMDUFFCOPY)
@ -15245,7 +15245,7 @@ func rewriteValueARM_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] {t} dst src mem)
-	// cond: (s > 512 || config.noDuffDevice) || t.(*types.Type).Alignment()%4 != 0
+	// cond: ((s > 512 || config.noDuffDevice) || t.(*types.Type).Alignment()%4 != 0) && logLargeCopy(v, s)
 	// result: (LoweredMove [t.(*types.Type).Alignment()] dst src (ADDconst <src.Type> src [s-moveSize(t.(*types.Type).Alignment(), config)]) mem)
 	for {
 		s := v.AuxInt
@ -15253,7 +15253,7 @@ func rewriteValueARM_OpMove(v *Value) bool {
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !((s > 512 || config.noDuffDevice) || t.(*types.Type).Alignment()%4 != 0) {
+		if !(((s > 512 || config.noDuffDevice) || t.(*types.Type).Alignment()%4 != 0) && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpARMLoweredMove)
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@ -23742,14 +23742,14 @@ func rewriteValueARM64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > 32 && s <= 16*64 && s%16 == 8 && !config.noDuffDevice
+	// cond: s > 32 && s <= 16*64 && s%16 == 8 && !config.noDuffDevice && logLargeCopy(v, s)
 	// result: (MOVDstore [s-8] dst (MOVDload [s-8] src mem) (DUFFCOPY <types.TypeMem> [8*(64-(s-8)/16)] dst src mem))
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 32 && s <= 16*64 && s%16 == 8 && !config.noDuffDevice) {
+		if !(s > 32 && s <= 16*64 && s%16 == 8 && !config.noDuffDevice && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpARM64MOVDstore)
@ -23764,14 +23764,14 @@ func rewriteValueARM64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > 32 && s <= 16*64 && s%16 == 0 && !config.noDuffDevice
+	// cond: s > 32 && s <= 16*64 && s%16 == 0 && !config.noDuffDevice && logLargeCopy(v, s)
 	// result: (DUFFCOPY [8 * (64 - s/16)] dst src mem)
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 32 && s <= 16*64 && s%16 == 0 && !config.noDuffDevice) {
+		if !(s > 32 && s <= 16*64 && s%16 == 0 && !config.noDuffDevice && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpARM64DUFFCOPY)
@ -23780,14 +23780,14 @@ func rewriteValueARM64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > 24 && s%8 == 0
+	// cond: s > 24 && s%8 == 0 && logLargeCopy(v, s)
 	// result: (LoweredMove dst src (ADDconst <src.Type> src [s-8]) mem)
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 24 && s%8 == 0) {
+		if !(s > 24 && s%8 == 0 && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpARM64LoweredMove)
--- a/src/cmd/compile/internal/ssa/rewriteMIPS.go
+++ b/src/cmd/compile/internal/ssa/rewriteMIPS.go
@ -5263,7 +5263,7 @@ func rewriteValueMIPS_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] {t} dst src mem)
-	// cond: (s > 16 || t.(*types.Type).Alignment()%4 != 0)
+	// cond: (s > 16 && logLargeCopy(v, s) || t.(*types.Type).Alignment()%4 != 0)
 	// result: (LoweredMove [t.(*types.Type).Alignment()] dst src (ADDconst <src.Type> src [s-moveSize(t.(*types.Type).Alignment(), config)]) mem)
 	for {
 		s := v.AuxInt
@ -5271,7 +5271,7 @@ func rewriteValueMIPS_OpMove(v *Value) bool {
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 16 || t.(*types.Type).Alignment()%4 != 0) {
+		if !(s > 16 && logLargeCopy(v, s) || t.(*types.Type).Alignment()%4 != 0) {
 			break
 		}
 		v.reset(OpMIPSLoweredMove)
--- a/src/cmd/compile/internal/ssa/rewriteMIPS64.go
+++ b/src/cmd/compile/internal/ssa/rewriteMIPS64.go
@ -5533,7 +5533,7 @@ func rewriteValueMIPS64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] {t} dst src mem)
-	// cond: s%8 == 0 && s >= 24 && s <= 8*128 && t.(*types.Type).Alignment()%8 == 0 && !config.noDuffDevice
+	// cond: s%8 == 0 && s >= 24 && s <= 8*128 && t.(*types.Type).Alignment()%8 == 0 && !config.noDuffDevice && logLargeCopy(v, s)
 	// result: (DUFFCOPY [16 * (128 - s/8)] dst src mem)
 	for {
 		s := v.AuxInt
@ -5541,7 +5541,7 @@ func rewriteValueMIPS64_OpMove(v *Value) bool {
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s%8 == 0 && s >= 24 && s <= 8*128 && t.(*types.Type).Alignment()%8 == 0 && !config.noDuffDevice) {
+		if !(s%8 == 0 && s >= 24 && s <= 8*128 && t.(*types.Type).Alignment()%8 == 0 && !config.noDuffDevice && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpMIPS64DUFFCOPY)
@ -5550,7 +5550,7 @@ func rewriteValueMIPS64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] {t} dst src mem)
-	// cond: s > 24 || t.(*types.Type).Alignment()%8 != 0
+	// cond: s > 24 && logLargeCopy(v, s) || t.(*types.Type).Alignment()%8 != 0
 	// result: (LoweredMove [t.(*types.Type).Alignment()] dst src (ADDVconst <src.Type> src [s-moveSize(t.(*types.Type).Alignment(), config)]) mem)
 	for {
 		s := v.AuxInt
@ -5558,7 +5558,7 @@ func rewriteValueMIPS64_OpMove(v *Value) bool {
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 24 || t.(*types.Type).Alignment()%8 != 0) {
+		if !(s > 24 && logLargeCopy(v, s) || t.(*types.Type).Alignment()%8 != 0) {
 			break
 		}
 		v.reset(OpMIPS64LoweredMove)
--- a/src/cmd/compile/internal/ssa/rewritePPC64.go
+++ b/src/cmd/compile/internal/ssa/rewritePPC64.go
@ -3486,14 +3486,14 @@ func rewriteValuePPC64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > 8
+	// cond: s > 8 && objabi.GOPPC64 <= 8 && logLargeCopy(v, s)
 	// result: (LoweredMove [s] dst src mem)
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 8) {
+		if !(s > 8 && objabi.GOPPC64 <= 8 && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpPPC64LoweredMove)
@ -3501,6 +3501,38 @@ func rewriteValuePPC64_OpMove(v *Value) bool {
 		v.AddArg3(dst, src, mem)
 		return true
 	}
+	// match: (Move [s] dst src mem)
+	// cond: s > 8 && s <= 64 && objabi.GOPPC64 >= 9
+	// result: (LoweredQuadMoveShort [s] dst src mem)
+	for {
+		s := v.AuxInt
+		dst := v_0
+		src := v_1
+		mem := v_2
+		if !(s > 8 && s <= 64 && objabi.GOPPC64 >= 9) {
+			break
+		}
+		v.reset(OpPPC64LoweredQuadMoveShort)
+		v.AuxInt = s
+		v.AddArg3(dst, src, mem)
+		return true
+	}
+	// match: (Move [s] dst src mem)
+	// cond: s > 8 && objabi.GOPPC64 >= 9 && logLargeCopy(v, s)
+	// result: (LoweredQuadMove [s] dst src mem)
+	for {
+		s := v.AuxInt
+		dst := v_0
+		src := v_1
+		mem := v_2
+		if !(s > 8 && objabi.GOPPC64 >= 9 && logLargeCopy(v, s)) {
+			break
+		}
+		v.reset(OpPPC64LoweredQuadMove)
+		v.AuxInt = s
+		v.AddArg3(dst, src, mem)
+		return true
+	}
 	return false
 }
 func rewriteValuePPC64_OpNeq16(v *Value) bool {
@ -14953,16 +14985,66 @@ func rewriteValuePPC64_OpZero(v *Value) bool {
 		return true
 	}
 	// match: (Zero [s] ptr mem)
+	// cond: objabi.GOPPC64 <= 8 && s < 64
+	// result: (LoweredZeroShort [s] ptr mem)
+	for {
+		s := v.AuxInt
+		ptr := v_0
+		mem := v_1
+		if !(objabi.GOPPC64 <= 8 && s < 64) {
+			break
+		}
+		v.reset(OpPPC64LoweredZeroShort)
+		v.AuxInt = s
+		v.AddArg2(ptr, mem)
+		return true
+	}
+	// match: (Zero [s] ptr mem)
+	// cond: objabi.GOPPC64 <= 8
 	// result: (LoweredZero [s] ptr mem)
 	for {
 		s := v.AuxInt
 		ptr := v_0
 		mem := v_1
+		if !(objabi.GOPPC64 <= 8) {
+			break
+		}
 		v.reset(OpPPC64LoweredZero)
 		v.AuxInt = s
 		v.AddArg2(ptr, mem)
 		return true
 	}
+	// match: (Zero [s] ptr mem)
+	// cond: s < 128 && objabi.GOPPC64 >= 9
+	// result: (LoweredQuadZeroShort [s] ptr mem)
+	for {
+		s := v.AuxInt
+		ptr := v_0
+		mem := v_1
+		if !(s < 128 && objabi.GOPPC64 >= 9) {
+			break
+		}
+		v.reset(OpPPC64LoweredQuadZeroShort)
+		v.AuxInt = s
+		v.AddArg2(ptr, mem)
+		return true
+	}
+	// match: (Zero [s] ptr mem)
+	// cond: objabi.GOPPC64 >= 9
+	// result: (LoweredQuadZero [s] ptr mem)
+	for {
+		s := v.AuxInt
+		ptr := v_0
+		mem := v_1
+		if !(objabi.GOPPC64 >= 9) {
+			break
+		}
+		v.reset(OpPPC64LoweredQuadZero)
+		v.AuxInt = s
+		v.AddArg2(ptr, mem)
+		return true
+	}
+	return false
 }
 func rewriteBlockPPC64(b *Block) bool {
 	switch b.Kind {
--- a/src/cmd/compile/internal/ssa/rewriteS390X.go
+++ b/src/cmd/compile/internal/ssa/rewriteS390X.go
@ -3303,14 +3303,14 @@ func rewriteValueS390X_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > 0 && s <= 256
+	// cond: s > 0 && s <= 256 && logLargeCopy(v, s)
 	// result: (MVC [makeValAndOff(s, 0)] dst src mem)
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 0 && s <= 256) {
+		if !(s > 0 && s <= 256 && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpS390XMVC)
@ -3319,14 +3319,14 @@ func rewriteValueS390X_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > 256 && s <= 512
+	// cond: s > 256 && s <= 512 && logLargeCopy(v, s)
 	// result: (MVC [makeValAndOff(s-256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem))
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 256 && s <= 512) {
+		if !(s > 256 && s <= 512 && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpS390XMVC)
@ -3338,14 +3338,14 @@ func rewriteValueS390X_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > 512 && s <= 768
+	// cond: s > 512 && s <= 768 && logLargeCopy(v, s)
 	// result: (MVC [makeValAndOff(s-512, 512)] dst src (MVC [makeValAndOff(256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem)))
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 512 && s <= 768) {
+		if !(s > 512 && s <= 768 && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpS390XMVC)
@ -3360,14 +3360,14 @@ func rewriteValueS390X_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > 768 && s <= 1024
+	// cond: s > 768 && s <= 1024 && logLargeCopy(v, s)
 	// result: (MVC [makeValAndOff(s-768, 768)] dst src (MVC [makeValAndOff(256, 512)] dst src (MVC [makeValAndOff(256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem))))
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 768 && s <= 1024) {
+		if !(s > 768 && s <= 1024 && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpS390XMVC)
@ -3385,14 +3385,14 @@ func rewriteValueS390X_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > 1024
+	// cond: s > 1024 && logLargeCopy(v, s)
 	// result: (LoweredMove [s%256] dst src (ADD <src.Type> src (MOVDconst [(s/256)*256])) mem)
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 1024) {
+		if !(s > 1024 && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpS390XLoweredMove)
--- a/src/cmd/compile/internal/ssa/rewriteWasm.go
+++ b/src/cmd/compile/internal/ssa/rewriteWasm.go
@ -2104,14 +2104,14 @@ func rewriteValueWasm_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s%8 == 0
+	// cond: s%8 == 0 && logLargeCopy(v, s)
 	// result: (LoweredMove [s/8] dst src mem)
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s%8 == 0) {
+		if !(s%8 == 0 && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpWasmLoweredMove)
--- a/src/cmd/dist/build.go
+++ b/src/cmd/dist/build.go
@ -1515,9 +1515,7 @@ func checkNotStale(goBinary string, targets ...string) {
 // by 'go tool dist list'.
 var cgoEnabled = map[string]bool{
 	"aix/ppc64":       true,
-	"darwin/386":      false, // Issue 31751
 	"darwin/amd64":    true,
-	"darwin/arm":      true,
 	"darwin/arm64":    true,
 	"dragonfly/amd64": true,
 	"freebsd/386":     true,
--- a/src/cmd/go/go_test.go
+++ b/src/cmd/go/go_test.go
@ -1946,9 +1946,9 @@ func TestGenerateUsesBuildContext(t *testing.T) {
 	tg.grepStdout("linux amd64", "unexpected GOOS/GOARCH combination")

 	tg.setenv("GOOS", "darwin")
-	tg.setenv("GOARCH", "386")
+	tg.setenv("GOARCH", "arm64")
 	tg.run("generate", "gen")
-	tg.grepStdout("darwin 386", "unexpected GOOS/GOARCH combination")
+	tg.grepStdout("darwin arm64", "unexpected GOOS/GOARCH combination")
 }

 func TestGoEnv(t *testing.T) {
--- a/src/cmd/go/internal/modload/mvs.go
+++ b/src/cmd/go/internal/modload/mvs.go
@ -148,7 +148,7 @@ func (r *mvsReqs) required(mod module.Version) ([]module.Version, error) {
 	if mpath := f.Module.Mod.Path; mpath != origPath && mpath != mod.Path {
 		return nil, module.VersionError(mod, fmt.Errorf(`parsing go.mod:
 	module declares its path as: %s
-	        but was required as: %s`, mpath, mod.Path))
+	        but was required as: %s`, mpath, origPath))
 	}
 	if f.Go != nil {
 		r.versions.LoadOrStore(mod, f.Go.Version)
--- a/src/cmd/go/testdata/mod/example.com_quote_v1.5.2.txt
+++ b/src/cmd/go/testdata/mod/example.com_quote_v1.5.2.txt
@ -0,0 +1,9 @@
+This module is a replacement for rsc.io/quote, but its go.mod file declares
+a module path different from its location and the original module.
+
+-- .mod --
+module rsc.io/Quote
+
+go 1.14
+-- .info --
+{"Version":"v1.5.2"}
--- a/src/cmd/go/testdata/script/mod_load_replace_mismatch.txt
+++ b/src/cmd/go/testdata/script/mod_load_replace_mismatch.txt
@ -0,0 +1,23 @@
+# If a replacement module declares a module path different from both
+# the original module and its location, report an error with all three paths.
+# In particular, the "required as" path should be the original.
+# Verifies golang.org/issue/38220.
+! go list .
+cmp stderr want
+
+-- go.mod --
+module m
+
+require rsc.io/quote v1.5.2
+
+replace rsc.io/quote v1.5.2 => example.com/quote v1.5.2
+
+-- use.go --
+package use
+
+import _ "rsc.io/quote"
+
+-- want --
+go: example.com/quote@v1.5.2: parsing go.mod:
+	module declares its path as: rsc.io/Quote
+	        but was required as: rsc.io/quote
--- a/src/cmd/internal/obj/mips/asm0.go
+++ b/src/cmd/internal/obj/mips/asm0.go
@ -1355,10 +1355,12 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 			r = int(o.param)
 		}
 		o1 = OP_RRR(c.oprrr(p.As), uint32(0), uint32(p.To.Reg), uint32(r))
-		rel := obj.Addrel(c.cursym)
-		rel.Off = int32(c.pc)
-		rel.Siz = 0
-		rel.Type = objabi.R_CALLIND
+		if p.As == obj.ACALL {
+			rel := obj.Addrel(c.cursym)
+			rel.Off = int32(c.pc)
+			rel.Siz = 0
+			rel.Type = objabi.R_CALLIND
+		}

 	case 19: /* mov $lcon,r ==> lu+or */
 		// NOTE: this case does not use REGTMP. If it ever does,
--- a/src/cmd/link/internal/ld/lib.go
+++ b/src/cmd/link/internal/ld/lib.go
@ -2361,6 +2361,7 @@ func (sc *stkChk) check(up *chain, depth int) int {
 	relocs := ldr.Relocs(s)
 	var ch1 chain
 	pcsp := obj.NewPCIter(uint32(ctxt.Arch.MinLC))
+	ri := 0
 	for pcsp.Init(info.Pcsp()); !pcsp.Done; pcsp.Next() {
 		// pcsp.value is in effect for [pcsp.pc, pcsp.nextpc).

@ -2371,8 +2372,8 @@ func (sc *stkChk) check(up *chain, depth int) int {
 		}

 		// Process calls in this span.
-		for i := 0; i < relocs.Count(); i++ {
-			r := relocs.At2(i)
+		for ; ri < relocs.Count(); ri++ {
+			r := relocs.At2(ri)
 			if uint32(r.Off()) >= pcsp.NextPC {
 				break
 			}
--- a/src/cmd/link/internal/ld/outbuf_test.go
+++ b/src/cmd/link/internal/ld/outbuf_test.go
@ -5,7 +5,9 @@
 package ld

 import (
+	"io/ioutil"
 	"os"
+	"path/filepath"
 	"runtime"
 	"testing"
 )
@ -17,12 +19,16 @@ func TestMMap(t *testing.T) {
 		t.Skip("unsupported OS")
 	case "darwin", "dragonfly", "freebsd", "linux", "openbsd", "windows":
 	}
-	filename := "foo.out"
+	dir, err := ioutil.TempDir("", "TestMMap")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.RemoveAll(dir)
+	filename := filepath.Join(dir, "foo.out")
 	ob := NewOutBuf(nil)
 	if err := ob.Open(filename); err != nil {
-		t.Errorf("error opening file: %v", err)
+		t.Fatalf("error opening file: %v", err)
 	}
-	defer os.RemoveAll(filename)
 	defer ob.Close()
 	if err := ob.Mmap(1 << 20); err != nil {
 		t.Errorf("error mmapping file %v", err)
--- a/src/crypto/rsa/pss.go
+++ b/src/crypto/rsa/pss.go
@ -4,9 +4,7 @@

 package rsa

-// This file implements the PSS signature scheme [1].
-//
-// [1] https://www.emc.com/collateral/white-papers/h11300-pkcs-1v2-2-rsa-cryptography-standard-wp.pdf
+// This file implements the RSASSA-PSS signature scheme according to RFC 8017.

 import (
 	"bytes"
@ -17,8 +15,22 @@ import (
 	"math/big"
 )

+// Per RFC 8017, Section 9.1
+//
+//     EM = MGF1 xor DB || H( 8*0x00 || mHash || salt ) || 0xbc
+//
+// where
+//
+//     DB = PS || 0x01 || salt
+//
+// and PS can be empty so
+//
+//     emLen = dbLen + hLen + 1 = psLen + sLen + hLen + 2
+//
+
 func emsaPSSEncode(mHash []byte, emBits int, salt []byte, hash hash.Hash) ([]byte, error) {
-	// See [1], section 9.1.1
+	// See RFC 8017, Section 9.1.1.
+
 	hLen := hash.Size()
 	sLen := len(salt)
 	emLen := (emBits + 7) / 8
@ -30,7 +42,7 @@ func emsaPSSEncode(mHash []byte, emBits int, salt []byte, hash hash.Hash) ([]byt
 	// 2.  Let mHash = Hash(M), an octet string of length hLen.

 	if len(mHash) != hLen {
-		return nil, errors.New("crypto/rsa: input must be hashed message")
+		return nil, errors.New("crypto/rsa: input must be hashed with given hash")
 	}

 	// 3.  If emLen < hLen + sLen + 2, output "encoding error" and stop.
@ -40,8 +52,9 @@ func emsaPSSEncode(mHash []byte, emBits int, salt []byte, hash hash.Hash) ([]byt
 	}

 	em := make([]byte, emLen)
-	db := em[:emLen-sLen-hLen-2+1+sLen]
-	h := em[emLen-sLen-hLen-2+1+sLen : emLen-1]
+	psLen := emLen - sLen - hLen - 2
+	db := em[:psLen+1+sLen]
+	h := em[psLen+1+sLen : emLen-1]

 	// 4.  Generate a random octet string salt of length sLen; if sLen = 0,
 	//     then salt is the empty string.
@ -69,8 +82,8 @@ func emsaPSSEncode(mHash []byte, emBits int, salt []byte, hash hash.Hash) ([]byt
 	// 8.  Let DB = PS || 0x01 || salt; DB is an octet string of length
 	//     emLen - hLen - 1.

-	db[emLen-sLen-hLen-2] = 0x01
-	copy(db[emLen-sLen-hLen-1:], salt)
+	db[psLen] = 0x01
+	copy(db[psLen+1:], salt)

 	// 9.  Let dbMask = MGF(H, emLen - hLen - 1).
 	//
@ -81,47 +94,57 @@ func emsaPSSEncode(mHash []byte, emBits int, salt []byte, hash hash.Hash) ([]byt
 	// 11. Set the leftmost 8 * emLen - emBits bits of the leftmost octet in
 	//     maskedDB to zero.

-	db[0] &= (0xFF >> uint(8*emLen-emBits))
+	db[0] &= 0xff >> (8*emLen - emBits)

 	// 12. Let EM = maskedDB || H || 0xbc.
-	em[emLen-1] = 0xBC
+	em[emLen-1] = 0xbc

 	// 13. Output EM.
 	return em, nil
 }

 func emsaPSSVerify(mHash, em []byte, emBits, sLen int, hash hash.Hash) error {
+	// See RFC 8017, Section 9.1.2.
+
+	hLen := hash.Size()
+	if sLen == PSSSaltLengthEqualsHash {
+		sLen = hLen
+	}
+	emLen := (emBits + 7) / 8
+	if emLen != len(em) {
+		return errors.New("rsa: internal error: inconsistent length")
+	}
+
 	// 1.  If the length of M is greater than the input limitation for the
 	//     hash function (2^61 - 1 octets for SHA-1), output "inconsistent"
 	//     and stop.
 	//
 	// 2.  Let mHash = Hash(M), an octet string of length hLen.
-	hLen := hash.Size()
 	if hLen != len(mHash) {
 		return ErrVerification
 	}

 	// 3.  If emLen < hLen + sLen + 2, output "inconsistent" and stop.
-	emLen := (emBits + 7) / 8
 	if emLen < hLen+sLen+2 {
 		return ErrVerification
 	}

 	// 4.  If the rightmost octet of EM does not have hexadecimal value
 	//     0xbc, output "inconsistent" and stop.
-	if em[len(em)-1] != 0xBC {
+	if em[emLen-1] != 0xbc {
 		return ErrVerification
 	}

 	// 5.  Let maskedDB be the leftmost emLen - hLen - 1 octets of EM, and
 	//     let H be the next hLen octets.
 	db := em[:emLen-hLen-1]
-	h := em[emLen-hLen-1 : len(em)-1]
+	h := em[emLen-hLen-1 : emLen-1]

 	// 6.  If the leftmost 8 * emLen - emBits bits of the leftmost octet in
 	//     maskedDB are not all equal to zero, output "inconsistent" and
 	//     stop.
-	if em[0]&(0xFF<<uint(8-(8*emLen-emBits))) != 0 {
+	var bitMask byte = 0xff >> (8*emLen - emBits)
+	if em[0] & ^bitMask != 0 {
 		return ErrVerification
 	}

@ -132,37 +155,30 @@ func emsaPSSVerify(mHash, em []byte, emBits, sLen int, hash hash.Hash) error {

 	// 9.  Set the leftmost 8 * emLen - emBits bits of the leftmost octet in DB
 	//     to zero.
-	db[0] &= (0xFF >> uint(8*emLen-emBits))
+	db[0] &= bitMask

+	// If we don't know the salt length, look for the 0x01 delimiter.
 	if sLen == PSSSaltLengthAuto {
-	FindSaltLength:
-		for sLen = emLen - (hLen + 2); sLen >= 0; sLen-- {
-			switch db[emLen-hLen-sLen-2] {
-			case 1:
-				break FindSaltLength
-			case 0:
-				continue
-			default:
-				return ErrVerification
-			}
-		}
-		if sLen < 0 {
+		psLen := bytes.IndexByte(db, 0x01)
+		if psLen < 0 {
 			return ErrVerification
 		}
-	} else {
-		// 10. If the emLen - hLen - sLen - 2 leftmost octets of DB are not zero
-		//     or if the octet at position emLen - hLen - sLen - 1 (the leftmost
-		//     position is "position 1") does not have hexadecimal value 0x01,
-		//     output "inconsistent" and stop.
-		for _, e := range db[:emLen-hLen-sLen-2] {
-			if e != 0x00 {
-				return ErrVerification
-			}
-		}
-		if db[emLen-hLen-sLen-2] != 0x01 {
+		sLen = len(db) - psLen - 1
+	}
+
+	// 10. If the emLen - hLen - sLen - 2 leftmost octets of DB are not zero
+	//     or if the octet at position emLen - hLen - sLen - 1 (the leftmost
+	//     position is "position 1") does not have hexadecimal value 0x01,
+	//     output "inconsistent" and stop.
+	psLen := emLen - hLen - sLen - 2
+	for _, e := range db[:psLen] {
+		if e != 0x00 {
 			return ErrVerification
 		}
 	}
+	if db[psLen] != 0x01 {
+		return ErrVerification
+	}

 	// 11.  Let salt be the last sLen octets of DB.
 	salt := db[len(db)-sLen:]
@ -181,19 +197,19 @@ func emsaPSSVerify(mHash, em []byte, emBits, sLen int, hash hash.Hash) error {
 	h0 := hash.Sum(nil)

 	// 14. If H = H', output "consistent." Otherwise, output "inconsistent."
-	if !bytes.Equal(h0, h) {
+	if !bytes.Equal(h0, h) { // TODO: constant time?
 		return ErrVerification
 	}
 	return nil
 }

-// signPSSWithSalt calculates the signature of hashed using PSS [1] with specified salt.
+// signPSSWithSalt calculates the signature of hashed using PSS with specified salt.
 // Note that hashed must be the result of hashing the input message using the
 // given hash function. salt is a random sequence of bytes whose length will be
 // later used to verify the signature.
 func signPSSWithSalt(rand io.Reader, priv *PrivateKey, hash crypto.Hash, hashed, salt []byte) (s []byte, err error) {
-	nBits := priv.N.BitLen()
-	em, err := emsaPSSEncode(hashed, nBits-1, salt, hash.New())
+	emBits := priv.N.BitLen() - 1
+	em, err := emsaPSSEncode(hashed, emBits, salt, hash.New())
 	if err != nil {
 		return
 	}
@ -202,7 +218,7 @@ func signPSSWithSalt(rand io.Reader, priv *PrivateKey, hash crypto.Hash, hashed,
 	if err != nil {
 		return
 	}
-	s = make([]byte, (nBits+7)/8)
+	s = make([]byte, priv.Size())
 	copyWithLeftPad(s, c.Bytes())
 	return
 }
@ -223,16 +239,15 @@ type PSSOptions struct {
 	// PSSSaltLength constants.
 	SaltLength int

-	// Hash, if not zero, overrides the hash function passed to SignPSS.
-	// This is the only way to specify the hash function when using the
-	// crypto.Signer interface.
+	// Hash is the hash function used to generate the message digest. If not
+	// zero, it overrides the hash function passed to SignPSS. It's required
+	// when using PrivateKey.Sign.
 	Hash crypto.Hash
 }

-// HashFunc returns pssOpts.Hash so that PSSOptions implements
-// crypto.SignerOpts.
-func (pssOpts *PSSOptions) HashFunc() crypto.Hash {
-	return pssOpts.Hash
+// HashFunc returns opts.Hash so that PSSOptions implements crypto.SignerOpts.
+func (opts *PSSOptions) HashFunc() crypto.Hash {
+	return opts.Hash
 }

 func (opts *PSSOptions) saltLength() int {
@ -242,56 +257,50 @@ func (opts *PSSOptions) saltLength() int {
 	return opts.SaltLength
 }

-// SignPSS calculates the signature of hashed using RSASSA-PSS [1].
-// Note that hashed must be the result of hashing the input message using the
-// given hash function. The opts argument may be nil, in which case sensible
-// defaults are used.
-func SignPSS(rand io.Reader, priv *PrivateKey, hash crypto.Hash, hashed []byte, opts *PSSOptions) ([]byte, error) {
+// SignPSS calculates the signature of digest using PSS.
+//
+// digest must be the result of hashing the input message using the given hash
+// function. The opts argument may be nil, in which case sensible defaults are
+// used. If opts.Hash is set, it overrides hash.
+func SignPSS(rand io.Reader, priv *PrivateKey, hash crypto.Hash, digest []byte, opts *PSSOptions) ([]byte, error) {
+	if opts != nil && opts.Hash != 0 {
+		hash = opts.Hash
+	}
+
 	saltLength := opts.saltLength()
 	switch saltLength {
 	case PSSSaltLengthAuto:
-		saltLength = (priv.N.BitLen()+7)/8 - 2 - hash.Size()
+		saltLength = priv.Size() - 2 - hash.Size()
 	case PSSSaltLengthEqualsHash:
 		saltLength = hash.Size()
 	}

-	if opts != nil && opts.Hash != 0 {
-		hash = opts.Hash
-	}
-
 	salt := make([]byte, saltLength)
 	if _, err := io.ReadFull(rand, salt); err != nil {
 		return nil, err
 	}
-	return signPSSWithSalt(rand, priv, hash, hashed, salt)
+	return signPSSWithSalt(rand, priv, hash, digest, salt)
 }

 // VerifyPSS verifies a PSS signature.
-// hashed is the result of hashing the input message using the given hash
-// function and sig is the signature. A valid signature is indicated by
-// returning a nil error. The opts argument may be nil, in which case sensible
-// defaults are used.
-func VerifyPSS(pub *PublicKey, hash crypto.Hash, hashed []byte, sig []byte, opts *PSSOptions) error {
-	return verifyPSS(pub, hash, hashed, sig, opts.saltLength())
-}
-
-// verifyPSS verifies a PSS signature with the given salt length.
-func verifyPSS(pub *PublicKey, hash crypto.Hash, hashed []byte, sig []byte, saltLen int) error {
-	nBits := pub.N.BitLen()
-	if len(sig) != (nBits+7)/8 {
+//
+// A valid signature is indicated by returning a nil error. digest must be the
+// result of hashing the input message using the given hash function. The opts
+// argument may be nil, in which case sensible defaults are used. opts.Hash is
+// ignored.
+func VerifyPSS(pub *PublicKey, hash crypto.Hash, digest []byte, sig []byte, opts *PSSOptions) error {
+	if len(sig) != pub.Size() {
 		return ErrVerification
 	}
 	s := new(big.Int).SetBytes(sig)
 	m := encrypt(new(big.Int), pub, s)
-	emBits := nBits - 1
+	emBits := pub.N.BitLen() - 1
 	emLen := (emBits + 7) / 8
-	if emLen < len(m.Bytes()) {
+	emBytes := m.Bytes()
+	if emLen < len(emBytes) {
 		return ErrVerification
 	}
 	em := make([]byte, emLen)
-	copyWithLeftPad(em, m.Bytes())
-	if saltLen == PSSSaltLengthEqualsHash {
-		saltLen = hash.Size()
-	}
-	return emsaPSSVerify(hashed, em, emBits, saltLen, hash.New())
+	copyWithLeftPad(em, emBytes)
+	return emsaPSSVerify(digest, em, emBits, opts.saltLength(), hash.New())
 }
--- a/src/crypto/rsa/rsa.go
+++ b/src/crypto/rsa/rsa.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// Package rsa implements RSA encryption as specified in PKCS#1.
+// Package rsa implements RSA encryption as specified in PKCS#1 and RFC 8017.
 //
 // RSA is a single, fundamental operation that is used in this package to
 // implement either public-key encryption or public-key signatures.
@ -10,13 +10,13 @@
 // The original specification for encryption and signatures with RSA is PKCS#1
 // and the terms "RSA encryption" and "RSA signatures" by default refer to
 // PKCS#1 version 1.5. However, that specification has flaws and new designs
-// should use version two, usually called by just OAEP and PSS, where
+// should use version 2, usually referred to simply as OAEP and PSS, where
 // possible.
 //
 // Two sets of interfaces are included in this package. When a more abstract
 // interface isn't necessary, there are functions for encrypting/decrypting
 // with v1.5/OAEP and signing/verifying with v1.5/PSS. If one needs to abstract
-// over the public-key primitive, the PrivateKey struct implements the
+// over the public key primitive, the PrivateKey type implements the
 // Decrypter and Signer interfaces from the crypto package.
 //
 // The RSA operations in this package are not implemented using constant-time algorithms.
@ -111,7 +111,8 @@ func (priv *PrivateKey) Public() crypto.PublicKey {

 // Sign signs digest with priv, reading randomness from rand. If opts is a
 // *PSSOptions then the PSS algorithm will be used, otherwise PKCS#1 v1.5 will
-// be used.
+// be used. digest must be the result of hashing the input message using
+// opts.HashFunc().
 //
 // This method implements crypto.Signer, which is an interface to support keys
 // where the private part is kept in, for example, a hardware module. Common
--- a/src/crypto/sha512/sha512block_generic.go
+++ b/src/crypto/sha512/sha512block_generic.go
@ -6,4 +6,6 @@

 package sha512

-var block = blockGeneric
+func block(dig *digest, p []byte) {
+	blockGeneric(dig, p)
+}
--- a/src/flag/flag.go
+++ b/src/flag/flag.go
@ -308,7 +308,7 @@ type ErrorHandling int
 // These constants cause FlagSet.Parse to behave as described if the parse fails.
 const (
 	ContinueOnError ErrorHandling = iota // Return a descriptive error.
-	ExitOnError                          // Call os.Exit(2).
+	ExitOnError                          // Call os.Exit(2), or os.Exit(0) for -h/-help.
 	PanicOnError                         // Call panic with a descriptive error.
 )

@ -979,6 +979,9 @@ func (f *FlagSet) Parse(arguments []string) error {
 		case ContinueOnError:
 			return err
 		case ExitOnError:
+			if err == ErrHelp {
+				os.Exit(0)
+			}
 			os.Exit(2)
 		case PanicOnError:
 			panic(err)
--- a/src/flag/flag_test.go
+++ b/src/flag/flag_test.go
@ -8,9 +8,12 @@ import (
 	"bytes"
 	. "flag"
 	"fmt"
+	"internal/testenv"
 	"io"
 	"io/ioutil"
 	"os"
+	"os/exec"
+	"runtime"
 	"sort"
 	"strconv"
 	"strings"
@ -544,3 +547,66 @@ func TestRangeError(t *testing.T) {
 		}
 	}
 }
+
+func TestExitCode(t *testing.T) {
+	testenv.MustHaveExec(t)
+
+	magic := 123
+	if os.Getenv("GO_CHILD_FLAG") != "" {
+		fs := NewFlagSet("test", ExitOnError)
+		if os.Getenv("GO_CHILD_FLAG_HANDLE") != "" {
+			var b bool
+			fs.BoolVar(&b, os.Getenv("GO_CHILD_FLAG_HANDLE"), false, "")
+		}
+		fs.Parse([]string{os.Getenv("GO_CHILD_FLAG")})
+		os.Exit(magic)
+	}
+
+	tests := []struct {
+		flag       string
+		flagHandle string
+		expectExit int
+	}{
+		{
+			flag:       "-h",
+			expectExit: 0,
+		},
+		{
+			flag:       "-help",
+			expectExit: 0,
+		},
+		{
+			flag:       "-undefined",
+			expectExit: 2,
+		},
+		{
+			flag:       "-h",
+			flagHandle: "h",
+			expectExit: magic,
+		},
+		{
+			flag:       "-help",
+			flagHandle: "help",
+			expectExit: magic,
+		},
+	}
+
+	for _, test := range tests {
+		cmd := exec.Command(os.Args[0], "-test.run=TestExitCode")
+		cmd.Env = append(
+			os.Environ(),
+			"GO_CHILD_FLAG="+test.flag,
+			"GO_CHILD_FLAG_HANDLE="+test.flagHandle,
+		)
+		cmd.Run()
+		got := cmd.ProcessState.ExitCode()
+		// ExitCode is either 0 or 1 on Plan 9.
+		if runtime.GOOS == "plan9" && test.expectExit != 0 {
+			test.expectExit = 1
+		}
+		if got != test.expectExit {
+			t.Errorf("unexpected exit code for test case %+v \n: got %d, expect %d",
+				test, got, test.expectExit)
+		}
+	}
+}
--- a/src/go/build/deps_test.go
+++ b/src/go/build/deps_test.go
@ -199,7 +199,7 @@ var pkgDeps = map[string][]string{
 	"runtime/trace":  {"L0", "context", "fmt"},
 	"text/tabwriter": {"L2"},

-	"testing":                  {"L2", "flag", "fmt", "internal/race", "os", "runtime/debug", "runtime/pprof", "runtime/trace", "time"},
+	"testing":                  {"L2", "flag", "fmt", "internal/race", "io/ioutil", "os", "runtime/debug", "runtime/pprof", "runtime/trace", "time"},
 	"testing/iotest":           {"L2", "log"},
 	"testing/quick":            {"L2", "flag", "fmt", "reflect", "time"},
 	"internal/obscuretestdata": {"L2", "OS", "encoding/base64"},
--- a/src/io/ioutil/export_test.go
+++ b/src/io/ioutil/export_test.go
@ -0,0 +1,7 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ioutil
+
+var ErrPatternHasSeparator = errPatternHasSeparator
--- a/src/io/ioutil/ioutil_test.go
+++ b/src/io/ioutil/ioutil_test.go
@ -2,10 +2,11 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-package ioutil
+package ioutil_test

 import (
 	"bytes"
+	. "io/ioutil"
 	"os"
 	"path/filepath"
 	"testing"
--- a/src/io/ioutil/tempfile_test.go
+++ b/src/io/ioutil/tempfile_test.go
@ -2,9 +2,10 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-package ioutil
+package ioutil_test

 import (
+	. "io/ioutil"
 	"os"
 	"path/filepath"
 	"regexp"
@ -59,7 +60,7 @@ func TestTempFile_BadPattern(t *testing.T) {
 	tests := []struct {
 		pattern string
 		wantErr bool
-	} {
+	}{
 		{"ioutil*test", false},
 		{"ioutil_test*foo", false},
 		{"ioutil_test" + sep + "foo", true},
@ -80,7 +81,7 @@ func TestTempFile_BadPattern(t *testing.T) {
 				if err == nil {
 					t.Errorf("Expected an error for pattern %q", tt.pattern)
 				}
-				if g, w := err, errPatternHasSeparator; g != w {
+				if g, w := err, ErrPatternHasSeparator; g != w {
 					t.Errorf("Error mismatch: got %#v, want %#v for pattern %q", g, w, tt.pattern)
 				}
 			} else if err != nil {
@ -166,7 +167,7 @@ func TestTempDir_BadPattern(t *testing.T) {
 	tests := []struct {
 		pattern string
 		wantErr bool
-	} {
+	}{
 		{"ioutil*test", false},
 		{"ioutil_test*foo", false},
 		{"ioutil_test" + sep + "foo", true},
@ -182,7 +183,7 @@ func TestTempDir_BadPattern(t *testing.T) {
 				if err == nil {
 					t.Errorf("Expected an error for pattern %q", tt.pattern)
 				}
-				if g, w := err, errPatternHasSeparator; g != w {
+				if g, w := err, ErrPatternHasSeparator; g != w {
 					t.Errorf("Error mismatch: got %#v, want %#v for pattern %q", g, w, tt.pattern)
 				}
 			} else if err != nil {
--- a/src/net/http/roundtrip_js.go
+++ b/src/net/http/roundtrip_js.go
@ -102,12 +102,17 @@ func (t *Transport) RoundTrip(req *Request) (*Response, error) {
 		js.CopyBytesToJS(buf, body)
 		opt.Set("body", buf)
 	}
-	respPromise := js.Global().Call("fetch", req.URL.String(), opt)
+
+	fetchPromise := js.Global().Call("fetch", req.URL.String(), opt)
 	var (
-		respCh = make(chan *Response, 1)
-		errCh  = make(chan error, 1)
+		respCh           = make(chan *Response, 1)
+		errCh            = make(chan error, 1)
+		success, failure js.Func
 	)
-	success := js.FuncOf(func(this js.Value, args []js.Value) interface{} {
+	success = js.FuncOf(func(this js.Value, args []js.Value) interface{} {
+		success.Release()
+		failure.Release()
+
 		result := args[0]
 		header := Header{}
 		// https://developer.mozilla.org/en-US/docs/Web/API/Headers/entries
@ -141,35 +146,29 @@ func (t *Transport) RoundTrip(req *Request) (*Response, error) {
 		}

 		code := result.Get("status").Int()
-		select {
-		case respCh <- &Response{
+		respCh <- &Response{
 			Status:        fmt.Sprintf("%d %s", code, StatusText(code)),
 			StatusCode:    code,
 			Header:        header,
 			ContentLength: contentLength,
 			Body:          body,
 			Request:       req,
-		}:
-		case <-req.Context().Done():
 		}

 		return nil
 	})
-	defer success.Release()
-	failure := js.FuncOf(func(this js.Value, args []js.Value) interface{} {
-		err := fmt.Errorf("net/http: fetch() failed: %s", args[0].Get("message").String())
-		select {
-		case errCh <- err:
-		case <-req.Context().Done():
-		}
+	failure = js.FuncOf(func(this js.Value, args []js.Value) interface{} {
+		success.Release()
+		failure.Release()
+		errCh <- fmt.Errorf("net/http: fetch() failed: %s", args[0].Get("message").String())
 		return nil
 	})
-	defer failure.Release()
-	respPromise.Call("then", success, failure)
+
+	fetchPromise.Call("then", success, failure)
 	select {
 	case <-req.Context().Done():
 		if !ac.IsUndefined() {
-			// Abort the Fetch request
+			// Abort the Fetch request.
 			ac.Call("abort")
 		}
 		return nil, req.Context().Err()
--- a/src/net/ip.go
+++ b/src/net/ip.go
@ -671,8 +671,8 @@ func parseIPv6(s string) (ip IP) {
 }

 // ParseIP parses s as an IP address, returning the result.
-// The string s can be in dotted decimal ("192.0.2.1")
-// or IPv6 ("2001:db8::68") form.
+// The string s can be in IPv4 dotted decimal ("192.0.2.1"), IPv6
+// ("2001:db8::68"), or IPv4-mapped IPv6 ("::ffff:192.0.2.1") form.
 // If s is not a valid textual representation of an IP address,
 // ParseIP returns nil.
 func ParseIP(s string) IP {
--- a/src/run.bat
+++ b/src/run.bat
@ -4,7 +4,7 @@

@echo off

-if exist ..\bin\go goto ok
+if exist ..\bin\go.exe goto ok
 echo Must run run.bat from Go src directory after installing cmd/go.
 goto fail
 :ok
--- a/src/runtime/internal/atomic/atomic_arm64.s
+++ b/src/runtime/internal/atomic/atomic_arm64.s
@ -61,9 +61,9 @@ TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
 	RET

 TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
-again:
 	MOVD	ptr+0(FP), R0
 	MOVW	new+8(FP), R1
+again:
 	LDAXRW	(R0), R2
 	STLXRW	R1, (R0), R3
 	CBNZ	R3, again
@ -71,9 +71,9 @@ again:
 	RET

 TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
-again:
 	MOVD	ptr+0(FP), R0
 	MOVD	new+8(FP), R1
+again:
 	LDAXR	(R0), R2
 	STLXR	R1, (R0), R3
 	CBNZ	R3, again
@ -108,9 +108,9 @@ ok:
 //      *val += delta;
 //      return *val;
 TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
-again:
 	MOVD	ptr+0(FP), R0
 	MOVW	delta+8(FP), R1
+again:
 	LDAXRW	(R0), R2
 	ADDW	R2, R1, R2
 	STLXRW	R2, (R0), R3
@ -119,9 +119,9 @@ again:
 	RET

 TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
-again:
 	MOVD	ptr+0(FP), R0
 	MOVD	delta+8(FP), R1
+again:
 	LDAXR	(R0), R2
 	ADD	R2, R1, R2
 	STLXR	R2, (R0), R3
--- a/src/runtime/race/README
+++ b/src/runtime/race/README
@ -4,10 +4,10 @@ the LLVM project (https://github.com/llvm/llvm-project/tree/master/compiler-rt).

 To update the .syso files use golang.org/x/build/cmd/racebuild.

-race_darwin_amd64.syso built with LLVM 810ae8ddac890a6613d814c0b5415c7fcb7f5cca and Go 8c6876e9a481a2ea48070d3285a07163f564877b.
-race_freebsd_amd64.syso built with LLVM 810ae8ddac890a6613d814c0b5415c7fcb7f5cca and Go 8c6876e9a481a2ea48070d3285a07163f564877b.
-race_linux_amd64.syso built with LLVM 810ae8ddac890a6613d814c0b5415c7fcb7f5cca and Go 8c6876e9a481a2ea48070d3285a07163f564877b.
+race_darwin_amd64.syso built with LLVM 0fb8a5356214c47bbb832e89fbb3da1c755eeb73 and Go 95773ab9b053edc43ba07a182f3d5e0e29775a45.
+race_freebsd_amd64.syso built with LLVM 0fb8a5356214c47bbb832e89fbb3da1c755eeb73 and Go 95773ab9b053edc43ba07a182f3d5e0e29775a45.
+race_linux_amd64.syso built with LLVM 0fb8a5356214c47bbb832e89fbb3da1c755eeb73 and Go 95773ab9b053edc43ba07a182f3d5e0e29775a45.
 race_linux_ppc64le.syso built with LLVM 810ae8ddac890a6613d814c0b5415c7fcb7f5cca and Go 8c6876e9a481a2ea48070d3285a07163f564877b.
 race_netbsd_amd64.syso built with LLVM 810ae8ddac890a6613d814c0b5415c7fcb7f5cca and Go 8c6876e9a481a2ea48070d3285a07163f564877b.
-race_windows_amd64.syso built with LLVM 810ae8ddac890a6613d814c0b5415c7fcb7f5cca and Go 8c6876e9a481a2ea48070d3285a07163f564877b.
-race_linux_arm64.syso built with LLVM 810ae8ddac890a6613d814c0b5415c7fcb7f5cca and Go 8c6876e9a481a2ea48070d3285a07163f564877b.
+race_windows_amd64.syso built with LLVM 0fb8a5356214c47bbb832e89fbb3da1c755eeb73 and Go 95773ab9b053edc43ba07a182f3d5e0e29775a45.
+race_linux_arm64.syso built with LLVM 0fb8a5356214c47bbb832e89fbb3da1c755eeb73 and Go 95773ab9b053edc43ba07a182f3d5e0e29775a45.
--- a/src/runtime/race/race_darwin_amd64.syso
+++ b/src/runtime/race/race_darwin_amd64.syso
--- a/src/runtime/race/race_freebsd_amd64.syso
+++ b/src/runtime/race/race_freebsd_amd64.syso
--- a/src/runtime/race/race_linux_amd64.syso
+++ b/src/runtime/race/race_linux_amd64.syso
--- a/src/runtime/race/race_linux_arm64.syso
+++ b/src/runtime/race/race_linux_arm64.syso
--- a/src/runtime/race/race_windows_amd64.syso
+++ b/src/runtime/race/race_windows_amd64.syso
--- a/src/runtime/time.go
+++ b/src/runtime/time.go
@ -216,11 +216,12 @@ func stopTimer(t *timer) bool {

 // resetTimer resets an inactive timer, adding it to the heap.
 //go:linkname resetTimer time.resetTimer
-func resetTimer(t *timer, when int64) {
+// Reports whether the timer was modified before it was run.
+func resetTimer(t *timer, when int64) bool {
 	if raceenabled {
 		racerelease(unsafe.Pointer(t))
 	}
-	resettimer(t, when)
+	return resettimer(t, when)
 }

 // modTimer modifies an existing timer.
@ -403,13 +404,15 @@ func dodeltimer0(pp *p) {

 // modtimer modifies an existing timer.
 // This is called by the netpoll code or time.Ticker.Reset.
-func modtimer(t *timer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr) {
+// Reports whether the timer was modified before it was run.
+func modtimer(t *timer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr) bool {
 	if when < 0 {
 		when = maxWhen
 	}

 	status := uint32(timerNoStatus)
 	wasRemoved := false
+	var pending bool
 	var mp *m
 loop:
 	for {
@ -419,6 +422,7 @@ loop:
 			// This could lead to a self-deadlock. See #38070.
 			mp = acquirem()
 			if atomic.Cas(&t.status, status, timerModifying) {
+				pending = true // timer not yet run
 				break loop
 			}
 			releasem(mp)
@ -431,6 +435,7 @@ loop:
 			// Act like addtimer.
 			if atomic.Cas(&t.status, status, timerModifying) {
 				wasRemoved = true
+				pending = false // timer already run or stopped
 				break loop
 			}
 			releasem(mp)
@ -440,6 +445,7 @@ loop:
 			mp = acquirem()
 			if atomic.Cas(&t.status, status, timerModifying) {
 				atomic.Xadd(&t.pp.ptr().deletedTimers, -1)
+				pending = false // timer already stopped
 				break loop
 			}
 			releasem(mp)
@ -510,14 +516,17 @@ loop:
 			wakeNetPoller(when)
 		}
 	}
+
+	return pending
 }

 // resettimer resets the time when a timer should fire.
 // If used for an inactive timer, the timer will become active.
 // This should be called instead of addtimer if the timer value has been,
 // or may have been, used previously.
-func resettimer(t *timer, when int64) {
-	modtimer(t, when, t.period, t.f, t.arg, t.seq)
+// Reports whether the timer was modified before it was run.
+func resettimer(t *timer, when int64) bool {
+	return modtimer(t, when, t.period, t.f, t.arg, t.seq)
 }

 // cleantimers cleans up the head of the timer queue. This speeds up
--- a/src/testing/testing.go
+++ b/src/testing/testing.go
@ -239,6 +239,7 @@ import (
 	"fmt"
 	"internal/race"
 	"io"
+	"io/ioutil"
 	"os"
 	"runtime"
 	"runtime/debug"
@ -362,6 +363,10 @@ type common struct {
 	barrier  chan bool // To signal parallel subtests they may start.
 	signal   chan bool // To signal a test is done.
 	sub      []*T      // Queue of subtests to be run in parallel.
+
+	tempDirOnce sync.Once
+	tempDir     string
+	tempDirErr  error
 }

 // Short reports whether the -test.short flag is set.
@ -561,6 +566,7 @@ type TB interface {
 	SkipNow()
 	Skipf(format string, args ...interface{})
 	Skipped() bool
+	TempDir() string

 	// A private method to prevent users implementing the
 	// interface and so future additions to it will not
@ -791,6 +797,30 @@ func (c *common) Cleanup(f func()) {
 	}
 }

+// TempDir returns a temporary directory for the test to use.
+// The directory is lazily created on first access; TempDir calls t.Fatal
+// if the directory creation fails.
+// Subsequent calls to t.TempDir return the same directory.
+// The directory is automatically removed by Cleanup when the test and
+// all its subtests complete.
+func (c *common) TempDir() string {
+	c.tempDirOnce.Do(func() {
+		c.Helper()
+		c.tempDir, c.tempDirErr = ioutil.TempDir("", c.Name())
+		if c.tempDirErr == nil {
+			c.Cleanup(func() {
+				if err := os.RemoveAll(c.tempDir); err != nil {
+					c.Errorf("TempDir RemoveAll cleanup: %v", err)
+				}
+			})
+		}
+	})
+	if c.tempDirErr != nil {
+		c.Fatalf("TempDir: %v", c.tempDirErr)
+	}
+	return c.tempDir
+}
+
 // panicHanding is an argument to runCleanup.
 type panicHandling int

--- a/src/testing/testing_test.go
+++ b/src/testing/testing_test.go
@ -5,6 +5,7 @@
 package testing_test

 import (
+	"io/ioutil"
 	"os"
 	"testing"
 )
@ -16,3 +17,50 @@ import (
 func TestMain(m *testing.M) {
 	os.Exit(m.Run())
 }
+
+func TestTempDir(t *testing.T) {
+	dirCh := make(chan string, 1)
+	t.Cleanup(func() {
+		// Verify directory has been removed.
+		select {
+		case dir := <-dirCh:
+			fi, err := os.Stat(dir)
+			if os.IsNotExist(err) {
+				// All good
+				return
+			}
+			if err != nil {
+				t.Fatal(err)
+			}
+			t.Errorf("directory %q stil exists: %v, isDir=%v", dir, fi, fi.IsDir())
+		default:
+			if !t.Failed() {
+				t.Fatal("never received dir channel")
+			}
+		}
+	})
+
+	dir := t.TempDir()
+	if dir == "" {
+		t.Fatal("expected dir")
+	}
+	dir2 := t.TempDir()
+	if dir != dir2 {
+		t.Fatal("directory changed between calls")
+	}
+	dirCh <- dir
+	fi, err := os.Stat(dir)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !fi.IsDir() {
+		t.Errorf("dir %q is not a dir", dir)
+	}
+	fis, err := ioutil.ReadDir(dir)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(fis) > 0 {
+		t.Errorf("unexpected %d files in TempDir: %v", len(fis), fis)
+	}
+}
--- a/src/time/sleep.go
+++ b/src/time/sleep.go
@ -38,7 +38,7 @@ func when(d Duration) int64 {

 func startTimer(*runtimeTimer)
 func stopTimer(*runtimeTimer) bool
-func resetTimer(*runtimeTimer, int64)
+func resetTimer(*runtimeTimer, int64) bool
 func modTimer(t *runtimeTimer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr)

 // The Timer type represents a single event.
@ -123,9 +123,7 @@ func (t *Timer) Reset(d Duration) bool {
 		panic("time: Reset called on uninitialized Timer")
 	}
 	w := when(d)
-	active := stopTimer(&t.r)
-	resetTimer(&t.r, w)
-	return active
+	return resetTimer(&t.r, w)
 }

 func sendTime(c interface{}, seq uintptr) {
--- a/test/codegen/copy.go
+++ b/test/codegen/copy.go
@ -34,6 +34,8 @@ func movesmall7() {
 func movesmall16() {
 	x := [...]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
 	// amd64:-".*memmove"
+	// ppc64:".*memmove"
+	// ppc64le:".*memmove"
 	copy(x[1:], x[:])
 }

@ -41,10 +43,34 @@ var x [256]byte

 // Check that large disjoint copies are replaced with moves.

+func moveDisjointStack32() {
+        var s [32]byte
+        // ppc64:-".*memmove"
+        // ppc64le:-".*memmove"
+        // ppc64le/power8:"LXVD2X",-"ADD",-"BC"
+        // ppc64le/power9:"LXV",-"LXVD2X",-"ADD",-"BC"
+        copy(s[:], x[:32])
+        runtime.KeepAlive(&s)
+}
+
+func moveDisjointStack64() {
+        var s [96]byte
+        // ppc64:-".*memmove"
+        // ppc64le:-".*memmove"
+        // ppc64le/power8:"LXVD2X","ADD","BC"
+        // ppc64le/power9:"LXV",-"LXVD2X",-"ADD",-"BC"
+        copy(s[:], x[:96])
+        runtime.KeepAlive(&s)
+}
+
 func moveDisjointStack() {
 	var s [256]byte
 	// s390x:-".*memmove"
 	// amd64:-".*memmove"
+	// ppc64:-".*memmove"
+	// ppc64le:-".*memmove"
+	// ppc64le/power8:"LXVD2X"
+	// ppc64le/power9:"LXV",-"LXVD2X"
 	copy(s[:], x[:])
 	runtime.KeepAlive(&s)
 }
@ -53,6 +79,10 @@ func moveDisjointArg(b *[256]byte) {
 	var s [256]byte
 	// s390x:-".*memmove"
 	// amd64:-".*memmove"
+	// ppc64:-".*memmove"
+	// ppc64le:-".*memmove"
+	// ppc64le/power8:"LXVD2X"
+	// ppc64le/power9:"LXV",-"LXVD2X"
 	copy(s[:], b[:])
 	runtime.KeepAlive(&s)
 }
@ -60,6 +90,10 @@ func moveDisjointArg(b *[256]byte) {
 func moveDisjointNoOverlap(a *[256]byte) {
 	// s390x:-".*memmove"
 	// amd64:-".*memmove"
+	// ppc64:-".*memmove"
+	// ppc64le:-".*memmove"
+	// ppc64le/power8:"LXVD2X"
+	// ppc64le/power9:"LXV",-"LXVD2X"
 	copy(a[:], a[128:])
 }

--- a/test/codegen/mathbits.go
+++ b/test/codegen/mathbits.go
@ -110,8 +110,9 @@ func Len8(n uint8) int {
 //    bits.OnesCount    //
 // -------------------- //

+// amd64:".*x86HasPOPCNT"
 func OnesCount(n uint) int {
-	// amd64:"POPCNTQ",".*x86HasPOPCNT"
+	// amd64:"POPCNTQ"
 	// arm64:"VCNT","VUADDLV"
 	// s390x:"POPCNT"
 	// ppc64:"POPCNTD"
@ -120,8 +121,9 @@ func OnesCount(n uint) int {
 	return bits.OnesCount(n)
 }

+// amd64:".*x86HasPOPCNT"
 func OnesCount64(n uint64) int {
-	// amd64:"POPCNTQ",".*x86HasPOPCNT"
+	// amd64:"POPCNTQ"
 	// arm64:"VCNT","VUADDLV"
 	// s390x:"POPCNT"
 	// ppc64:"POPCNTD"
@ -130,8 +132,9 @@ func OnesCount64(n uint64) int {
 	return bits.OnesCount64(n)
 }

+// amd64:".*x86HasPOPCNT"
 func OnesCount32(n uint32) int {
-	// amd64:"POPCNTL",".*x86HasPOPCNT"
+	// amd64:"POPCNTL"
 	// arm64:"VCNT","VUADDLV"
 	// s390x:"POPCNT"
 	// ppc64:"POPCNTW"
@ -140,8 +143,9 @@ func OnesCount32(n uint32) int {
 	return bits.OnesCount32(n)
 }

+// amd64:".*x86HasPOPCNT"
 func OnesCount16(n uint16) int {
-	// amd64:"POPCNTL",".*x86HasPOPCNT"
+	// amd64:"POPCNTL"
 	// arm64:"VCNT","VUADDLV"
 	// s390x:"POPCNT"
 	// ppc64:"POPCNTW"
--- a/test/inline.go
+++ b/test/inline.go
@ -180,3 +180,21 @@ func (T) meth2(int, int) { // not inlineable - has 2 calls.
 	runtime.GC()
 	runtime.GC()
 }
+
+// Issue #29737 - make sure we can do inlining for a chain of recursive functions
+func ee() { // ERROR "can inline ee"
+	ff(100) // ERROR "inlining call to ff" "inlining call to gg" "inlining call to hh"
+}
+
+func ff(x int) { // ERROR "can inline ff"
+	if x < 0 {
+		return
+	}
+	gg(x - 1)
+}
+func gg(x int) { // ERROR "can inline gg"
+	hh(x - 1)
+}
+func hh(x int) { // ERROR "can inline hh"
+	ff(x - 1) // ERROR "inlining call to ff"  // ERROR "inlining call to gg"
+}
--- a/test/nowritebarrier.go
+++ b/test/nowritebarrier.go
@ -67,6 +67,7 @@ func d2() {
 	d3()
 }

+//go:noinline
 func d3() {
 	x.f = y // ERROR "write barrier prohibited by caller"
 	d4()