[dev.boringcrypto] all: merge master into dev.boringcrypto

Change-Id: If0a6a3d0abf15d9584ce572510b5bb31872d432f
2025-05-30 19:52:53 +00:00 · 2021-11-08 14:46:41 -05:00 · 2021-11-08 14:46:41 -05:00 · c9858c7bdc
commit c9858c7bdc
parent ed07c49cb6 035963c7f5
90 changed files with 12935 additions and 3333 deletions
--- a/doc/go1.18.html
+++ b/doc/go1.18.html
@ -31,19 +31,30 @@ Do not send CLs removing the interior tags from such phrases.

 <h2 id="ports">Ports</h2>

-<p id="freebsd">
+<h3 id="freebsd">FreeBSD</h3>
+
+<p>
  Go 1.18 is the last release that is supported on FreeBSD 11.x, which has
  already reached end-of-life. Go 1.19 will require FreeBSD 12.2+ or FreeBSD
  13.0+.
  FreeBSD 13.0+ will require a kernel with the COMPAT_FREEBSD12 option set (this is the default).
 </p>

-<h2 id="tools">Tools</h2>
+<h3 id="ppc64">PPC64</h3>

-<p>
-  TODO: complete this section, or delete if not needed
+<p><!-- CL 353969 -->
+  TODO: <a href="https://golang.org/cl/353969">https://golang.org/cl/353969</a>: internal/buildcfg: enable register ABI for PPC64
 </p>

+<h3 id="riscv">RISC-V</h3>
+
+<p><!-- golang.org/issue/47100, CL 334872 -->
+  The 64-bit RISC-V architecture on Linux (the <code>linux/riscv64</code> port)
+  now supports the <code>c-archive</code> and <code>c-shared</code> build modes.
+</p>
+
+<h2 id="tools">Tools</h2>
+
 <h3 id="go-command">Go command</h3>

 <p><!-- golang.org/issue/43684 -->
@ -103,8 +114,8 @@ Do not send CLs removing the interior tags from such phrases.
  <code>go</code> <code>mod</code> <code>download</code> <code>all</code>.
 </p>

-<p>
-  TODO: complete this section, or delete if not needed
+<p><!-- CL 349595 -->
+  TODO: <a href="https://golang.org/cl/349595">https://golang.org/cl/349595</a>: cmd/go: add GOAMD64 environment variable
 </p>

 <h3 id="gofmt"><code>gofmt</code></h3>
@ -115,7 +126,6 @@ Do not send CLs removing the interior tags from such phrases.
  multiple CPUs, <code>gofmt</code> should now be significantly faster.
 </p>

-
 <h2 id="runtime">Runtime</h2>

 <p>
@ -124,24 +134,30 @@ Do not send CLs removing the interior tags from such phrases.

 <h2 id="compiler">Compiler</h2>

-<p>
-  TODO: complete this section, or delete if not needed
+<p><!-- CL 298611 -->
+  TODO: <a href="https://golang.org/cl/298611">https://golang.org/cl/298611</a>: cmd/compile: add -asan option
+</p>
+
+<p><!-- CL 352057 -->
+  TODO: <a href="https://golang.org/cl/352057">https://golang.org/cl/352057</a>: cmd/compile, runtime: track argument stack slot liveness
 </p>

 <h2 id="linker">Linker</h2>

-<p>
-  TODO: complete this section, or delete if not needed
+<p><!-- CL 298610 -->
+  TODO: <a href="https://golang.org/cl/298610">https://golang.org/cl/298610</a>: cmd/link: add -asan option
 </p>

 <h2 id="library">Core library</h2>

-<h3>TODO</h3>
-<p>
-  TODO: complete this section
+<h3 id="constraints">New <code>constraints</code> package</h3>
+
+<p><!-- CL 349709 -->
+  TODO: <a href="https://golang.org/cl/349709">https://golang.org/cl/349709</a>: constraints: new package
 </p>

 <h3 id="netip">New <code>net/netip</code> package</h3>
+
 <p>
  The new <a href="/pkg/net/netip/"><code>net/netip</code></a>
  package defines a new IP address type, <a href="/pkg/net/netip/#Addr"><code>Addr</code></a>.
@ -163,6 +179,12 @@ Do not send CLs removing the interior tags from such phrases.
  <code>*net.UDPAddr</code> values.
 </p>

+<h3>TODO</h3>
+
+<p>
+  TODO: complete this section
+</p>
+
 <h3 id="minor_library_changes">Minor changes to the library</h3>

 <p>
@ -175,6 +197,26 @@ Do not send CLs removing the interior tags from such phrases.
  TODO: complete this section
 </p>

+<dl id="bufio"><dt><a href="/pkg/bufio/">bufio</a></dt>
+  <dd>
+    <p><!-- CL 345569 -->
+      TODO: <a href="https://golang.org/cl/345569">https://golang.org/cl/345569</a>: add Writer.AvailableBuffer
+    </p>
+
+    <p><!-- CL 345570 -->
+      TODO: <a href="https://golang.org/cl/345570">https://golang.org/cl/345570</a>: make Reader.Reset and Writer.Reset work on the zero value
+    </p>
+  </dd>
+</dl><!-- bufio -->
+
+<dl id="crypto/tls"><dt><a href="/pkg/crypto/tls/">crypto/tls</a></dt>
+  <dd>
+    <p><!-- CL 325250 -->
+      TODO: <a href="https://golang.org/cl/325250">https://golang.org/cl/325250</a>: add Conn.NetConn method
+    </p>
+  </dd>
+</dl><!-- crypto/tls -->
+
 <dl id="debug/buildinfo"><dt><a href="/pkg/debug/buildinfo">debug/buildinfo</a></dt>
  <dd>
    <p><!-- golang.org/issue/39301 -->
@ -201,9 +243,33 @@ Do not send CLs removing the interior tags from such phrases.
  </dd>
 </dl><!-- image/draw -->

+<dl id="net"><dt><a href="/pkg/net/">net</a></dt>
+  <dd>
+    <p><!-- CL 340261 -->
+      TODO: <a href="https://golang.org/cl/340261">https://golang.org/cl/340261</a>: deprecate (net.Error).Temporary
+    </p>
+  </dd>
+</dl><!-- net -->
+
+<dl id="net/http"><dt><a href="/pkg/net/http/">net/http</a></dt>
+  <dd>
+    <p><!-- CL 338590 -->
+      TODO: <a href="https://golang.org/cl/338590">https://golang.org/cl/338590</a>: add Cookie.Valid method
+    </p>
+  </dd>
+</dl><!-- net/http -->
+
+<dl id="os/user"><dt><a href="/pkg/os/user/">os/user</a></dt>
+  <dd>
+    <p><!-- CL 330753 -->
+      TODO: <a href="https://golang.org/cl/330753">https://golang.org/cl/330753</a>: implement go native GroupIds
+    </p>
+  </dd>
+</dl><!-- os/user -->
+
 <dl id="reflect"><dt><a href="/pkg/reflect/">reflect</a></dt>
  <dd>
-    <p><!-- CL 356049, 320929 -->
+    <p><!-- CL 356049, CL 320929 -->
      The new
      <a href="/pkg/reflect/#Value.SetIterKey"><code>Value.SetIterKey</code></a>
      and <a href="/pkg/reflect/#Value.SetIterValue"><code>Value.SetIterValue</code></a>
@ -211,8 +277,7 @@ Do not send CLs removing the interior tags from such phrases.
      <code>Value.Set(iter.Key())</code> and <code>Value.Set(iter.Value())</code> but
      do fewer allocations.
    </p>
-  </dd>
-  <dd>
+
    <p><!-- CL 350691 -->
      The new
      <a href="/pkg/reflect/#Value.UnsafePointer"><code>Value.UnsafePointer</code></a>
@ -221,9 +286,69 @@ Do not send CLs removing the interior tags from such phrases.
      and <a href="/pkg/reflect/#Value.Pointer"><code>Value.Pointer</code></a>
      to eliminate the need to perform uintptr to unsafe.Pointer conversions at the callsite (as unsafe.Pointer rules require).
    </p>
+
+    <p><!-- CL 321889 -->
+      TODO: <a href="https://golang.org/cl/321889">https://golang.org/cl/321889</a>: allocate hiter as part of MapIter
+    </p>
+
+    <p><!-- CL 321891 -->
+      TODO: <a href="https://golang.org/cl/321891">https://golang.org/cl/321891</a>: add MapIter.Reset
+    </p>
+
+    <p><!-- CL 345486 -->
+      TODO: <a href="https://golang.org/cl/345486">https://golang.org/cl/345486</a>: optimize for maps with string keys
+    </p>
+
+    <p><!-- CL 352131 -->
+      TODO: <a href="https://golang.org/cl/352131">https://golang.org/cl/352131</a>: add Value.{CanInt, CanUint, CanFloat, CanComplex}
+    </p>
+
+    <p><!-- CL 357962 -->
+      TODO: <a href="https://golang.org/cl/357962">https://golang.org/cl/357962</a>: add FieldByIndexErr
+    </p>
  </dd>
 </dl><!-- reflect -->

+<dl id="regexp"><dt><a href="/pkg/regexp/">regexp</a></dt>
+  <dd>
+    <p><!-- CL 354569 -->
+      TODO: <a href="https://golang.org/cl/354569">https://golang.org/cl/354569</a>: document and implement that invalid UTF-8 bytes are the same as U+FFFD
+    </p>
+  </dd>
+</dl><!-- regexp -->
+
+<dl id="strconv"><dt><a href="/pkg/strconv/">strconv</a></dt>
+  <dd>
+    <p><!-- CL 343877 -->
+      TODO: <a href="https://golang.org/cl/343877">https://golang.org/cl/343877</a>: reject surrogate halves in Unquote
+    </p>
+  </dd>
+</dl><!-- strconv -->
+
+<dl id="strings"><dt><a href="/pkg/strings/">strings</a></dt>
+  <dd>
+    <p><!-- CL 345849 -->
+      TODO: <a href="https://golang.org/cl/345849">https://golang.org/cl/345849</a>: add Clone function
+    </p>
+  </dd>
+</dl><!-- strings -->
+
+<dl id="strings,bytes"><dt><a href="/pkg/strings,bytes/">strings,bytes</a></dt>
+  <dd>
+    <p><!-- CL 332771 -->
+      TODO: <a href="https://golang.org/cl/332771">https://golang.org/cl/332771</a>: avoid allocations in Trim/TrimLeft/TrimRight
+    </p>
+  </dd>
+</dl><!-- strings,bytes -->
+
+<dl id="sync"><dt><a href="/pkg/sync/">sync</a></dt>
+  <dd>
+    <p><!-- CL 319769 -->
+      TODO: <a href="https://golang.org/cl/319769">https://golang.org/cl/319769</a>: add Mutex.TryLock, RWMutex.TryLock, RWMutex.TryRLock
+    </p>
+  </dd>
+</dl><!-- sync -->
+
 <dl id="syscall"><dt><a href="/pkg/syscall/">syscall</a></dt>
  <dd>
    <p><!-- CL 336550 -->
@ -238,5 +363,45 @@ Do not send CLs removing the interior tags from such phrases.
      <a href="/pkg/syscall/?GOOS=windows#Syscall18"><code>Syscall18</code></a> are
      deprecated in favor of <a href="/pkg/syscall/?GOOS=windows#SyscallN"><code>SyscallN</code></a>.
    </p>
+
+    <p><!-- CL 355570 -->
+      TODO: <a href="https://golang.org/cl/355570">https://golang.org/cl/355570</a>: add support for SysProcAttr.Pdeathsig on FreeBSD
+    </p>
  </dd>
 </dl><!-- syscall -->
+
+<dl id="syscall/js"><dt><a href="/pkg/syscall/js/">syscall/js</a></dt>
+  <dd>
+    <p><!-- CL 356430 -->
+      TODO: <a href="https://golang.org/cl/356430">https://golang.org/cl/356430</a>: remove Wrapper interface
+    </p>
+  </dd>
+</dl><!-- syscall/js -->
+
+<dl id="testing"><dt><a href="/pkg/testing/">testing</a></dt>
+  <dd>
+    <p><!-- CL 343883 -->
+      TODO: <a href="https://golang.org/cl/343883">https://golang.org/cl/343883</a>: increase alternation precedence
+    </p>
+
+    <p><!-- CL 356669 -->
+      TODO: <a href="https://golang.org/cl/356669">https://golang.org/cl/356669</a>: skip extra -count iterations if there are no tests
+    </p>
+  </dd>
+</dl><!-- testing -->
+
+<dl id="text/template"><dt><a href="/pkg/text/template/">text/template</a></dt>
+  <dd>
+    <p><!-- CL 321490 -->
+      TODO: <a href="https://golang.org/cl/321490">https://golang.org/cl/321490</a>: implement short-circuit and, or
+    </p>
+  </dd>
+</dl><!-- text/template -->
+
+<dl id="unicode/utf8"><dt><a href="/pkg/unicode/utf8/">unicode/utf8</a></dt>
+  <dd>
+    <p><!-- CL 345571 -->
+      TODO: <a href="https://golang.org/cl/345571">https://golang.org/cl/345571</a>: add AppendRune
+    </p>
+  </dd>
+</dl><!-- unicode/utf8 -->
--- a/src/bufio/bufio_test.go
+++ b/src/bufio/bufio_test.go
@ -1520,7 +1520,7 @@ func TestReaderDiscard(t *testing.T) {
 			wantBuffered: 0,
 		},
 		// Any error from filling shouldn't show up until we
-		// get past the valid bytes. Here we return we return 5 valid bytes at the same time
+		// get past the valid bytes. Here we return 5 valid bytes at the same time
 		// as an error, but test that we don't see the error from Discard.
 		{
 			name: "fill error, discard less",
--- a/src/bytes/bytes.go
+++ b/src/bytes/bytes.go
@ -746,7 +746,8 @@ func isSeparator(r rune) bool {
 // Title treats s as UTF-8-encoded bytes and returns a copy with all Unicode letters that begin
 // words mapped to their title case.
 //
-// BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
+// Deprecated: The rule Title uses for word boundaries does not handle Unicode
+// punctuation properly. Use golang.org/x/text/cases instead.
 func Title(s []byte) []byte {
 	// Use a closure here to remember state.
 	// Hackish but effective. Depends on Map scanning in order and calling
--- a/src/bytes/example_test.go
+++ b/src/bytes/example_test.go
@ -37,6 +37,16 @@ func ExampleBuffer_Bytes() {
 	// Output: hello world
 }

+func ExampleBuffer_Cap() {
+	buf1 := bytes.NewBuffer(make([]byte, 10))    // backing slice: len 10, cap 10
+	buf2 := bytes.NewBuffer(make([]byte, 0, 10)) // backing slice: len 0, cap 10
+	fmt.Println(buf1.Cap())
+	fmt.Println(buf2.Cap())
+	// Output:
+	// 10
+	// 10
+}
+
 func ExampleBuffer_Grow() {
 	var b bytes.Buffer
 	b.Grow(64)
@ -67,6 +77,39 @@ func ExampleBuffer_Next() {
 	// e
 }

+func ExampleBuffer_Read() {
+	var b bytes.Buffer
+	b.Grow(64)
+	b.Write([]byte("abcde"))
+	rdbuf := make([]byte, 1)
+	n, err := b.Read(rdbuf) // reads up to len(rdbuf) bytes and removes them from b
+	if err != nil {
+		panic(err)
+	}
+	fmt.Println(n)
+	fmt.Println(b.String())
+	fmt.Println(string(rdbuf))
+	// Output:
+	// 1
+	// bcde
+	// a
+}
+
+func ExampleBuffer_ReadByte() {
+	var b bytes.Buffer
+	b.Grow(64)
+	b.Write([]byte("abcde"))
+	c, err := b.ReadByte() // removes and returns the first byte of b
+	if err != nil {
+		panic(err)
+	}
+	fmt.Println(c) // c is a byte, so it prints as a number
+	fmt.Println(b.String())
+	// Output:
+	// 97
+	// bcde
+}
+
 func ExampleCompare() {
 	// Interpret Compare's result by comparing it to zero.
 	var a, b []byte
--- a/src/cmd/compile/internal/inline/inl.go
+++ b/src/cmd/compile/internal/inline/inl.go
@ -309,7 +309,7 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 			break
 		}

-		if fn := inlCallee(n.X); fn != nil && fn.Inl != nil {
+		if fn := inlCallee(n.X); fn != nil && typecheck.HaveInlineBody(fn) {
 			v.budget -= fn.Inl.Cost
 			break
 		}
@ -585,7 +585,7 @@ func inlnode(n ir.Node, maxCost int32, inlMap map[*ir.Func]bool, edit func(ir.No
 		if ir.IsIntrinsicCall(call) {
 			break
 		}
-		if fn := inlCallee(call.X); fn != nil && fn.Inl != nil {
+		if fn := inlCallee(call.X); fn != nil && typecheck.HaveInlineBody(fn) {
 			n = mkinlcall(call, fn, maxCost, inlMap, edit)
 		}
 	}
--- a/src/cmd/compile/internal/ssa/expand_calls.go
+++ b/src/cmd/compile/internal/ssa/expand_calls.go
@ -954,11 +954,11 @@ func (x *expandState) storeArgOrLoad(pos src.XPos, b *Block, source, mem *Value,
 		elt := t.Elem()
 		if source.Type != t && t.NumElem() == 1 && elt.Size() == t.Size() && t.Size() == x.regSize {
 			t = removeTrivialWrapperTypes(t)
-			source.Type = t
 			// it could be a leaf type, but the "leaf" could be complex64 (for example)
 			return x.storeArgOrLoad(pos, b, source, mem, t, storeOffset, loadRegOffset, storeRc)
 		}
 		eltRO := x.regWidth(elt)
+		source.Type = t
 		for i := int64(0); i < t.NumElem(); i++ {
 			sel := source.Block.NewValue1I(pos, OpArraySelect, elt, i, source)
 			mem = x.storeArgOrLoad(pos, b, sel, mem, elt, storeOffset+i*elt.Size(), loadRegOffset, storeRc.at(t, 0))
@ -988,11 +988,11 @@ func (x *expandState) storeArgOrLoad(pos src.XPos, b *Block, source, mem *Value,
 			// v139 is later stored as an intVal == struct{val *big.Int} which naively requires the fields of
 			// of a *uint8, which does not succeed.
 			t = removeTrivialWrapperTypes(t)
-			source.Type = t
 			// it could be a leaf type, but the "leaf" could be complex64 (for example)
 			return x.storeArgOrLoad(pos, b, source, mem, t, storeOffset, loadRegOffset, storeRc)
 		}

+		source.Type = t
 		for i := 0; i < t.NumFields(); i++ {
 			fld := t.Field(i)
 			sel := source.Block.NewValue1I(pos, OpStructSelect, fld.Type, int64(i), source)
--- a/src/cmd/compile/internal/ssa/stmtlines_test.go
+++ b/src/cmd/compile/internal/ssa/stmtlines_test.go
@ -89,6 +89,9 @@ func TestStmtLines(t *testing.T) {
 		if pkgname == "runtime" {
 			continue
 		}
+		if pkgname == "crypto/elliptic/internal/fiat" {
+			continue // golang.org/issue/49372
+		}
 		if e.Val(dwarf.AttrStmtList) == nil {
 			continue
 		}
--- a/src/cmd/compile/internal/typecheck/crawler.go
+++ b/src/cmd/compile/internal/typecheck/crawler.go
@ -207,7 +207,7 @@ func (p *crawler) markInlBody(n *ir.Name) {
 	if fn == nil {
 		base.Fatalf("markInlBody: missing Func on %v", n)
 	}
-	if fn.Inl == nil {
+	if !HaveInlineBody(fn) {
 		return
 	}

--- a/src/cmd/compile/internal/typecheck/iimport.go
+++ b/src/cmd/compile/internal/typecheck/iimport.go
@ -81,6 +81,27 @@ func ImportBody(fn *ir.Func) {
 	inimport = false
 }

+// HaveInlineBody reports whether we have fn's inline body available
+// for inlining.
+func HaveInlineBody(fn *ir.Func) bool {
+	if fn.Inl == nil { // fn carries no Inl record at all
+		return false
+	}
+
+	// Unified IR is much more conservative about pruning unreachable
+	// methods (at the cost of increased build artifact size).
+	if base.Debug.Unified != 0 {
+		return true
+	}
+
+	if fn.Inl.Body != nil { // body is already populated
+		return true
+	}
+
+	_, ok := inlineImporter[fn.Nname.Sym()] // NOTE(review): presumably means the body can still be imported on demand — confirm
+	return ok
+}
+
 func importReaderFor(sym *types.Sym, importers map[*types.Sym]iimporterAndOffset) *importReader {
 	x, ok := importers[sym]
 	if !ok {
--- a/src/cmd/dist/test.go
+++ b/src/cmd/dist/test.go
@ -1013,7 +1013,7 @@ func (t *tester) internalLink() bool {
 func (t *tester) internalLinkPIE() bool {
 	switch goos + "-" + goarch {
 	case "darwin-amd64", "darwin-arm64",
-		"linux-amd64", "linux-arm64",
+		"linux-amd64", "linux-arm64", "linux-ppc64le",
 		"android-arm64",
 		"windows-amd64", "windows-386", "windows-arm":
 		return true
--- a/src/cmd/go/internal/modload/init.go
+++ b/src/cmd/go/internal/modload/init.go
@ -968,7 +968,7 @@ func makeMainModules(ms []module.Version, rootDirs []string, modFiles []*modfile
 			for _, r := range modFiles[i].Replace {
 				if replacedByWorkFile[r.Old.Path] {
 					continue
-				} else if prev, ok := replacements[r.Old]; ok && !curModuleReplaces[r.Old] {
+				} else if prev, ok := replacements[r.Old]; ok && !curModuleReplaces[r.Old] && prev != r.New {
 					base.Fatalf("go: conflicting replacements for %v:\n\t%v\n\t%v\nuse \"go mod editwork -replace %v=[override]\" to resolve", r.Old, prev, r.New, r.Old)
 				}
 				curModuleReplaces[r.Old] = true
--- a/src/cmd/go/internal/modload/modfile.go
+++ b/src/cmd/go/internal/modload/modfile.go
@ -378,7 +378,7 @@ func canonicalizeReplacePath(r module.Version, modRoot string) module.Version {
 		return r
 	}
 	abs := filepath.Join(modRoot, r.Path)
-	if rel, err := filepath.Rel(workFilePath, abs); err == nil {
+	if rel, err := filepath.Rel(filepath.Dir(workFilePath), abs); err == nil {
 		return module.Version{Path: rel, Version: r.Version}
 	}
 	// We couldn't make the version's path relative to the workspace's path,
--- a/src/cmd/go/testdata/script/gcflags_patterns.txt
+++ b/src/cmd/go/testdata/script/gcflags_patterns.txt
@ -58,8 +58,7 @@ go build -n -ldflags=-X=math.pi=3
 stderr 'link.* -X=math.pi=3'

 # -ldflags applies to current directory even if GOPATH is funny
-[windows] cd $WORK/GoPath/src/my/cmd/prog
-[darwin] cd $WORK/GoPath/src/my/cmd/prog
+[!case-sensitive] cd $WORK/GoPath/src/my/cmd/prog
 go build -n -ldflags=-X=math.pi=3
 stderr 'link.* -X=math.pi=3'

--- a/src/cmd/internal/sys/supported.go
+++ b/src/cmd/internal/sys/supported.go
@ -158,7 +158,7 @@ func BuildModeSupported(compiler, buildmode, goos, goarch string) bool {
 func InternalLinkPIESupported(goos, goarch string) bool {
 	switch goos + "/" + goarch {
 	case "darwin/amd64", "darwin/arm64",
-		"linux/amd64", "linux/arm64",
+		"linux/amd64", "linux/arm64", "linux/ppc64le",
 		"android/arm64",
 		"windows-amd64", "windows-386", "windows-arm":
 		return true
--- a/src/cmd/link/internal/ld/config.go
+++ b/src/cmd/link/internal/ld/config.go
@ -225,7 +225,8 @@ func mustLinkExternal(ctxt *Link) (res bool, reason string) {
 		return true, "buildmode=c-shared"
 	case BuildModePIE:
 		switch buildcfg.GOOS + "/" + buildcfg.GOARCH {
-		case "linux/amd64", "linux/arm64", "android/arm64":
+		case "android/arm64":
+		case "linux/amd64", "linux/arm64", "linux/ppc64le":
 		case "windows/386", "windows/amd64", "windows/arm", "windows/arm64":
 		case "darwin/amd64", "darwin/arm64":
 		default:
--- a/src/cmd/link/internal/ld/data.go
+++ b/src/cmd/link/internal/ld/data.go
@ -227,6 +227,8 @@ func (st *relocSymState) relocsym(s loader.Sym, P []byte) {
 					// DWARF info between the compiler and linker.
 					continue
 				}
+			} else if target.IsPPC64() && target.IsPIE() && ldr.SymName(rs) == ".TOC." {
+				// This is a TOC relative relocation generated from a go object. It is safe to resolve.
 			} else {
 				st.err.errorUnresolved(ldr, s, rs)
 				continue
--- a/src/cmd/link/internal/ppc64/asm.go
+++ b/src/cmd/link/internal/ppc64/asm.go
@ -321,6 +321,11 @@ func addelfdynrel(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, s lo
 			rela.AddUint64(target.Arch, elf.R_INFO(uint32(ldr.SymDynid(targ)), uint32(elf.R_PPC64_ADDR64)))
 			rela.AddUint64(target.Arch, uint64(r.Add()))
 			su.SetRelocType(rIdx, objabi.ElfRelocOffset) // ignore during relocsym
+		} else if target.IsPIE() && target.IsInternal() {
+			// For internal linking PIE, this R_ADDR relocation cannot
+			// be resolved statically. We need to generate a dynamic
+			// relocation. Let the code below handle it.
+			break
 		}
 		return true

@ -383,12 +388,94 @@ func addelfdynrel(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, s lo
 	}

 	// Handle references to ELF symbols from our own object files.
-	if targType != sym.SDYNIMPORT {
+	relocs := ldr.Relocs(s)
+	r = relocs.At(rIdx)
+
+	switch r.Type() {
+	case objabi.R_ADDR:
+		if ldr.SymType(s) == sym.STEXT {
+			log.Fatalf("R_ADDR relocation in text symbol %s is unsupported\n", ldr.SymName(s))
+		}
+		if target.IsPIE() && target.IsInternal() {
+			// When internally linking, generate dynamic relocations
+			// for all typical R_ADDR relocations. The exceptions
+			// are those R_ADDR relocations created as part of generating
+			// the dynamic relocations, which must be resolved statically.
+			//
+			// There are three phases relevant to understanding this:
+			//
+			//	dodata()  // we are here
+			//	address() // symbol address assignment
+			//	reloc()   // resolution of static R_ADDR relocs
+			//
+			// At this point symbol addresses have not been
+			// assigned yet (as the final size of the .rela section
+			// will affect the addresses), and so we cannot write
+			// the Elf64_Rela.r_offset now. Instead we delay it
+			// until after the 'address' phase of the linker is
+			// complete. We do this via Addaddrplus, which creates
+			// a new R_ADDR relocation which will be resolved in
+			// the 'reloc' phase.
+			//
+			// These synthetic static R_ADDR relocs must be skipped
+			// now, or else we will be caught in an infinite loop
+			// of generating synthetic relocs for our synthetic
+			// relocs.
+			//
+			// Furthermore, the rela sections contain dynamic
+			// relocations with R_ADDR relocations on
+			// Elf64_Rela.r_offset. This field should contain the
+			// symbol offset as determined by reloc(), not the
+			// final dynamically linked address as a dynamic
+			// relocation would provide.
+			switch ldr.SymName(s) {
+			case ".dynsym", ".rela", ".rela.plt", ".got.plt", ".dynamic":
+				return false
+			}
+		} else {
+			// Either internally linking a static executable,
+			// in which case we can resolve these relocations
+			// statically in the 'reloc' phase, or externally
+			// linking, in which case the relocation will be
+			// prepared in the 'reloc' phase and passed to the
+			// external linker in the 'asmb' phase.
+			if ldr.SymType(s) != sym.SDATA && ldr.SymType(s) != sym.SRODATA {
+				break
+			}
+		}
+		// Generate R_PPC64_RELATIVE relocations for best
+		// efficiency in the dynamic linker.
+		//
+		// As noted above, symbol addresses have not been
+		// assigned yet, so we can't generate the final reloc
+		// entry yet. We ultimately want:
+		//
+		// r_offset = s + r.Off
+		// r_info = R_PPC64_RELATIVE
+		// r_addend = targ + r.Add
+		//
+		// The dynamic linker will set *offset = base address +
+		// addend.
+		//
+		// AddAddrPlus is used for r_offset and r_addend to
+		// generate new R_ADDR relocations that will update
+		// these fields in the 'reloc' phase.
+		rela := ldr.MakeSymbolUpdater(syms.Rela)
+		rela.AddAddrPlus(target.Arch, s, int64(r.Off()))
+		if r.Siz() == 8 {
+			rela.AddUint64(target.Arch, elf.R_INFO(0, uint32(elf.R_PPC64_RELATIVE)))
+		} else {
+			ldr.Errorf(s, "unexpected relocation for dynamic symbol %s", ldr.SymName(targ))
+		}
+		rela.AddAddrPlus(target.Arch, targ, int64(r.Add()))
+
+		// Do not mark r done here, so that it is still applied
+		// statically. That way the file content also has the right
+		// offset to the relocation target, and it can be examined
+		// statically (e.g. by go version).
 		return true
 	}

-	// TODO(austin): Translate our relocations to ELF
-
 	return false
 }

@ -542,35 +629,40 @@ func symtoc(ldr *loader.Loader, syms *ld.ArchSyms, s loader.Sym) int64 {
 }

 // archreloctoc relocates a TOC relative symbol.
-// If the symbol pointed by this TOC relative symbol is in .data or .bss, the
-// default load instruction can be changed to an addi instruction and the
-// symbol address can be used directly.
-// This code is for AIX only.
 func archreloctoc(ldr *loader.Loader, target *ld.Target, syms *ld.ArchSyms, r loader.Reloc, s loader.Sym, val int64) int64 {
 	rs := r.Sym()
-	if target.IsLinux() {
-		ldr.Errorf(s, "archrelocaddr called for %s relocation\n", ldr.SymName(rs))
-	}
 	var o1, o2 uint32
-
-	o1 = uint32(val >> 32)
-	o2 = uint32(val)
-
-	if !strings.HasPrefix(ldr.SymName(rs), "TOC.") {
-		ldr.Errorf(s, "archreloctoc called for a symbol without TOC anchor")
-	}
 	var t int64
 	useAddi := false
-	relocs := ldr.Relocs(rs)
-	tarSym := relocs.At(0).Sym()

-	if target.IsInternal() && tarSym != 0 && ldr.AttrReachable(tarSym) && ldr.SymSect(tarSym).Seg == &ld.Segdata {
-		t = ldr.SymValue(tarSym) + r.Add() - ldr.SymValue(syms.TOC)
-		// change ld to addi in the second instruction
-		o2 = (o2 & 0x03FF0000) | 0xE<<26
-		useAddi = true
+	if target.IsBigEndian() {
+		o1 = uint32(val >> 32)
+		o2 = uint32(val)
 	} else {
-		t = ldr.SymValue(rs) + r.Add() - ldr.SymValue(syms.TOC)
+		o1 = uint32(val)
+		o2 = uint32(val >> 32)
+	}
+
+	// On AIX, TOC data accesses are always made indirectly against R2 (a sequence of addis+ld+load/store). If the
+	// target of the load is known, the sequence can be rewritten as addis+addi+load/store. On Linux,
+	// TOC data accesses are always made directly against R2 (e.g. addis+load/store).
+	if target.IsAIX() {
+		if !strings.HasPrefix(ldr.SymName(rs), "TOC.") {
+			ldr.Errorf(s, "archreloctoc called for a symbol without TOC anchor")
+		}
+		relocs := ldr.Relocs(rs)
+		tarSym := relocs.At(0).Sym()
+
+		if target.IsInternal() && tarSym != 0 && ldr.AttrReachable(tarSym) && ldr.SymSect(tarSym).Seg == &ld.Segdata {
+			t = ldr.SymValue(tarSym) + r.Add() - ldr.SymValue(syms.TOC)
+			// change ld to addi in the second instruction
+			o2 = (o2 & 0x03FF0000) | 0xE<<26
+			useAddi = true
+		} else {
+			t = ldr.SymValue(rs) + r.Add() - ldr.SymValue(syms.TOC)
+		}
+	} else {
+		t = ldr.SymValue(rs) + r.Add() - symtoc(ldr, syms, s)
 	}

 	if t != int64(int32(t)) {
@ -593,15 +685,20 @@ func archreloctoc(ldr *loader.Loader, target *ld.Target, syms *ld.ArchSyms, r lo
 			}
 			o2 |= uint32(t) & 0xFFFC
 		}
+	case objabi.R_ADDRPOWER_TOCREL:
+		o2 |= uint32(t) & 0xffff
 	default:
 		return -1
 	}

-	return int64(o1)<<32 | int64(o2)
+	if target.IsBigEndian() {
+		return int64(o1)<<32 | int64(o2)
+	}
+	return int64(o2)<<32 | int64(o1)
 }

 // archrelocaddr relocates a symbol address.
-// This code is for AIX only.
+// This code is for linux only.
 func archrelocaddr(ldr *loader.Loader, target *ld.Target, syms *ld.ArchSyms, r loader.Reloc, s loader.Sym, val int64) int64 {
 	rs := r.Sym()
 	if target.IsAIX() {
@ -860,6 +957,18 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade

 		t := ldr.SymValue(rs) + r.Add() - (ldr.SymValue(s) + int64(r.Off()))

+		tgtName := ldr.SymName(rs)
+
+		// If we are linking PIE or shared code, all Go-generated object files have an extra 2-instruction prologue
+		// to regenerate the TOC pointer from R12. The exceptions are the two special-case functions tested below. Note that
+		// local call offsets for externally generated objects are accounted for when converting into Go relocs.
+		if !ldr.IsExternal(rs) && ldr.AttrShared(rs) && tgtName != "runtime.duffzero" && tgtName != "runtime.duffcopy" {
+			// Furthermore, only apply the offset if the target looks like the start of a function call.
+			if r.Add() == 0 && ldr.SymType(rs) == sym.STEXT {
+				t += 8
+			}
+		}
+
 		if t&3 != 0 {
 			ldr.Errorf(s, "relocation for %s+%d is not aligned: %d", ldr.SymName(rs), r.Off(), t)
 		}
@ -872,6 +981,62 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade
 	case objabi.R_POWER_TOC: // S + A - .TOC.
 		return ldr.SymValue(rs) + r.Add() - symtoc(ldr, syms, s), nExtReloc, true

+	case objabi.R_ADDRPOWER_PCREL: // S + A - P
+		t := ldr.SymValue(rs) + r.Add() - (ldr.SymValue(s) + int64(r.Off()))
+		ha := uint16(((t + 0x8000) >> 16) & 0xFFFF)
+		l := uint16(t)
+		if target.IsBigEndian() {
+			val |= int64(l)
+			val |= int64(ha) << 32
+		} else {
+			val |= int64(ha)
+			val |= int64(l) << 32
+		}
+		return val, nExtReloc, true
+
+	case objabi.R_POWER_TLS:
+		const OP_ADD = 31<<26 | 266<<1
+		const MASK_OP_ADD = 0x3F<<26 | 0x1FF<<1
+		if val&MASK_OP_ADD != OP_ADD {
+			ldr.Errorf(s, "R_POWER_TLS reloc only supports XO form ADD, not %08X", val)
+		}
+		// Verify RB is R13 in ADD RA,RB,RT.
+		if (val>>11)&0x1F != 13 {
+			// If external linking is made to support this, it may expect the linker to rewrite RB.
+			ldr.Errorf(s, "R_POWER_TLS reloc requires R13 in RB (%08X).", uint32(val))
+		}
+		return val, nExtReloc, true
+
+	case objabi.R_POWER_TLS_IE:
+		// Convert TLS_IE relocation to TLS_LE if supported.
+		if !(target.IsPIE() && target.IsElf()) {
+			log.Fatalf("cannot handle R_POWER_TLS_IE (sym %s) when linking non-PIE, non-ELF binaries internally", ldr.SymName(s))
+		}
+
+		// We are an ELF binary, we can safely convert to TLS_LE from:
+		// addis to, r2, x@got@tprel@ha
+		// ld to, to, x@got@tprel@l(to)
+		//
+		// to TLS_LE by converting to:
+		// addis to, r0, x@tprel@ha
+		// addi to, to, x@tprel@l(to)
+
+		const OP_ADDI = 14 << 26
+		const OP_MASK = 0x3F << 26
+		const OP_RA_MASK = 0x1F << 16
+		uval := uint64(val)
+		// convert r2 to r0, and ld to addi
+		if target.IsBigEndian() {
+			uval = uval &^ (OP_RA_MASK << 32)
+			uval = (uval &^ OP_MASK) | OP_ADDI
+		} else {
+			uval = uval &^ (OP_RA_MASK)
+			uval = (uval &^ (OP_MASK << 32)) | (OP_ADDI << 32)
+		}
+		val = int64(uval)
+		// Treat this like an R_POWER_TLS_LE relocation now.
+		fallthrough
+
 	case objabi.R_POWER_TLS_LE:
 		// The thread pointer points 0x7000 bytes after the start of the
 		// thread local storage area as documented in section "3.7.2 TLS
--- a/src/crypto/aes/asm_ppc64le.s
+++ b/src/crypto/aes/asm_ppc64le.s
@ -13,8 +13,8 @@
 // Original code can be found at the link below:
 // https://github.com/dot-asm/cryptogams/blob/master/ppc/aesp8-ppc.pl

-// I changed some function names in order to be more likely to go standards.
-// For instance, function aes_p8_set_{en,de}crypt_key become
+// Some function names were changed to be consistent with Go function
+// names. For instance, functions aes_p8_set_{en,de}crypt_key became
 // set{En,De}cryptKeyAsm. I also split setEncryptKeyAsm in two parts
 // and a new session was created (doEncryptKeyAsm). This was necessary to
 // avoid arguments overwriting when setDecryptKeyAsm calls setEncryptKeyAsm.
@ -50,452 +50,451 @@
 #define BLK_ROUNDS R6
 #define BLK_IDX    R7

-DATA  ·rcon+0x00(SB)/8, $0x0100000001000000 // RCON
-DATA  ·rcon+0x08(SB)/8, $0x0100000001000000 // RCON
-DATA  ·rcon+0x10(SB)/8, $0x1b0000001b000000
-DATA  ·rcon+0x18(SB)/8, $0x1b0000001b000000
-DATA  ·rcon+0x20(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
-DATA  ·rcon+0x28(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
-DATA  ·rcon+0x30(SB)/8, $0x0000000000000000
-DATA  ·rcon+0x38(SB)/8, $0x0000000000000000
+DATA ·rcon+0x00(SB)/8, $0x0100000001000000 // RCON
+DATA ·rcon+0x08(SB)/8, $0x0100000001000000 // RCON
+DATA ·rcon+0x10(SB)/8, $0x1b0000001b000000
+DATA ·rcon+0x18(SB)/8, $0x1b0000001b000000
+DATA ·rcon+0x20(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
+DATA ·rcon+0x28(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
+DATA ·rcon+0x30(SB)/8, $0x0000000000000000
+DATA ·rcon+0x38(SB)/8, $0x0000000000000000
 GLOBL ·rcon(SB), RODATA, $64

 // func setEncryptKeyAsm(key *byte, keylen int, enc *uint32) int
-TEXT ·setEncryptKeyAsm(SB),NOSPLIT|NOFRAME,$0
+TEXT ·setEncryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
 	// Load the arguments inside the registers
-	MOVD key+0(FP), INP
-	MOVD keylen+8(FP), BITS
-	MOVD enc+16(FP), OUT
-	JMP ·doEncryptKeyAsm(SB)
+	MOVD	key+0(FP), INP
+	MOVD	keylen+8(FP), BITS
+	MOVD	enc+16(FP), OUT
+	JMP	·doEncryptKeyAsm(SB)

 // This text is used by both setEncryptKeyAsm and setDecryptKeyAsm
-TEXT ·doEncryptKeyAsm(SB),NOSPLIT|NOFRAME,$0
+TEXT ·doEncryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
 	// Do not change R10 since it's storing the LR value in setDecryptKeyAsm

 	// Check arguments
-	MOVD $-1, PTR                  // li    6,-1       exit code to -1 (255)
-	CMPU INP, $0                   // cmpldi r3,0      input key pointer set?
-	BC 0x0E, 2, enc_key_abort      // beq-  .Lenc_key_abort
-	CMPU OUT, $0                   // cmpldi r5,0      output key pointer set?
-	BC 0x0E, 2, enc_key_abort      // beq-  .Lenc_key_abort
-	MOVD $-2, PTR                  // li    6,-2       exit code to -2 (254)
-	CMPW BITS, $128                // cmpwi 4,128      greater or equal to 128
-	BC 0x0E, 0, enc_key_abort      // blt-  .Lenc_key_abort
-	CMPW BITS, $256                // cmpwi 4,256      lesser or equal to 256
-	BC 0x0E, 1, enc_key_abort      // bgt-  .Lenc_key_abort
-	ANDCC $0x3f, BITS, TEMP        // andi. 0,4,0x3f   multiple of 64
-	BC 0x06, 2, enc_key_abort      // bne-  .Lenc_key_abort
+	MOVD	$-1, PTR               // li    6,-1       exit code to -1 (255)
+	CMPU	INP, $0                // cmpldi r3,0      input key pointer set?
+	BC	0x0E, 2, enc_key_abort // beq-  .Lenc_key_abort
+	CMPU	OUT, $0                // cmpldi r5,0      output key pointer set?
+	BC	0x0E, 2, enc_key_abort // beq-  .Lenc_key_abort
+	MOVD	$-2, PTR               // li    6,-2       exit code to -2 (254)
+	CMPW	BITS, $128             // cmpwi 4,128      greater or equal to 128
+	BC	0x0E, 0, enc_key_abort // blt-  .Lenc_key_abort
+	CMPW	BITS, $256             // cmpwi 4,256      lesser or equal to 256
+	BC	0x0E, 1, enc_key_abort // bgt-  .Lenc_key_abort
+	ANDCC	$0x3f, BITS, TEMP      // andi. 0,4,0x3f   multiple of 64
+	BC	0x06, 2, enc_key_abort // bne-  .Lenc_key_abort

-	MOVD $·rcon(SB), PTR           // PTR point to rcon addr
+	MOVD	$·rcon(SB), PTR // PTR points to rcon addr

 	// Get key from memory and write aligned into VR
-	NEG INP, R9                    // neg   9,3        R9 is ~INP + 1
-	LVX (INP)(R0), IN0             // lvx   1,0,3      Load key inside IN0
-	ADD $15, INP, INP              // addi  3,3,15     Add 15B to INP addr
-	LVSR (R9)(R0), KEY             // lvsr  3,0,9
-	MOVD $0x20, R8                 // li    8,0x20     R8 = 32
-	CMPW BITS, $192                // cmpwi 4,192      Key size == 192?
-	LVX (INP)(R0), IN1             // lvx   2,0,3
-	VSPLTISB $0x0f, MASK           // vspltisb 5,0x0f  0x0f0f0f0f... mask
-	LVX (PTR)(R0), RCON            // lvx   4,0,6      Load first 16 bytes into RCON
-	VXOR KEY, MASK, KEY            // vxor  3,3,5      Adjust for byte swap
-	LVX (PTR)(R8), MASK            // lvx   5,8,6
-	ADD $0x10, PTR, PTR            // addi  6,6,0x10   PTR to next 16 bytes of RCON
-	VPERM IN0, IN1, KEY, IN0       // vperm 1,1,2,3    Align
-	MOVD $8, CNT                   // li    7,8        CNT = 8
-	VXOR ZERO, ZERO, ZERO          // vxor  0,0,0      Zero to be zero :)
-	MOVD CNT, CTR                  // mtctr 7          Set the counter to 8 (rounds)
+	NEG	INP, R9            // neg   9,3        R9 is ~INP + 1
+	LVX	(INP)(R0), IN0     // lvx   1,0,3      Load key inside IN0
+	ADD	$15, INP, INP      // addi  3,3,15     Add 15B to INP addr
+	LVSR	(R9)(R0), KEY      // lvsr  3,0,9
+	MOVD	$0x20, R8          // li    8,0x20     R8 = 32
+	CMPW	BITS, $192         // cmpwi 4,192      Key size == 192?
+	LVX	(INP)(R0), IN1     // lvx   2,0,3
+	VSPLTISB	$0x0f, MASK// vspltisb 5,0x0f  0x0f0f0f0f... mask
+	LVX	(PTR)(R0), RCON    // lvx   4,0,6      Load first 16 bytes into RCON
+	VXOR	KEY, MASK, KEY     // vxor  3,3,5      Adjust for byte swap
+	LVX	(PTR)(R8), MASK    // lvx   5,8,6
+	ADD	$0x10, PTR, PTR    // addi  6,6,0x10   PTR to next 16 bytes of RCON
+	VPERM	IN0, IN1, KEY, IN0 // vperm 1,1,2,3    Align
+	MOVD	$8, CNT            // li    7,8        CNT = 8
+	VXOR	ZERO, ZERO, ZERO   // vxor  0,0,0      Zero to be zero :)
+	MOVD	CNT, CTR           // mtctr 7          Set the counter to 8 (rounds)

-	LVSL (OUT)(R0), OUTPERM        // lvsl  8,0,5
-	VSPLTISB $-1, OUTMASK          // vspltisb      9,-1
-	LVX (OUT)(R0), OUTHEAD         // lvx   10,0,5
-	VPERM OUTMASK, ZERO, OUTPERM, OUTMASK  // vperm 9,9,0,8
+	LVSL	(OUT)(R0), OUTPERM              // lvsl  8,0,5
+	VSPLTISB	$-1, OUTMASK                    // vspltisb      9,-1
+	LVX	(OUT)(R0), OUTHEAD              // lvx   10,0,5
+	VPERM	OUTMASK, ZERO, OUTPERM, OUTMASK // vperm 9,9,0,8

-	BLT loop128                    // blt   .Loop128
-	ADD $8, INP, INP               // addi  3,3,8
-	BEQ l192                       // beq   .L192
-	ADD $8, INP, INP               // addi  3,3,8
-	JMP l256                       // b     .L256
+	BLT	loop128      // blt   .Loop128
+	ADD	$8, INP, INP // addi  3,3,8
+	BEQ	l192         // beq   .L192
+	ADD	$8, INP, INP // addi  3,3,8
+	JMP	l256         // b     .L256

 loop128:
 	// Key schedule (Round 1 to 8)
-	VPERM IN0, IN0, MASK, KEY      // vperm 3,1,1,5         Rotate-n-splat
-	VSLDOI $12, ZERO, IN0, TMP     // vsldoi 6,0,1,12
-	VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8    Rotate
-	VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR OUTTAIL, OUTTAIL, OUTHEAD  // vor 10,11,11
-	VCIPHERLAST KEY, RCON, KEY     // vcipherlast 3,3,4
-	STVX STAGE, (OUT+R0)           // stvx 7,0,5        Write to output
-	ADD $16, OUT, OUT              // addi 5,5,16       Point to the next round
+	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5         Rotate-n-splat
+	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
+	VPERM	IN0, IN0, OUTPERM, OUTTAIL       // vperm 11,1,1,8    Rotate
+	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
+	VOR	OUTTAIL, OUTTAIL, OUTHEAD        // vor 10,11,11
+	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
+	STVX	STAGE, (OUT+R0)                  // stvx 7,0,5        Write to output
+	ADD	$16, OUT, OUT                    // addi 5,5,16       Point to the next round

-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VADDUWM RCON, RCON, RCON       // vadduwm 4,4,4
-	VXOR IN0, KEY, IN0             // vxor 1,1,3
-	BC 0x10, 0, loop128            // bdnz .Loop128
+	VXOR	IN0, TMP, IN0       // vxor 1,1,6
+	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
+	VXOR	IN0, TMP, IN0       // vxor 1,1,6
+	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
+	VXOR	IN0, TMP, IN0       // vxor 1,1,6
+	VADDUWM	RCON, RCON, RCON    // vadduwm 4,4,4
+	VXOR	IN0, KEY, IN0       // vxor 1,1,3
+	BC	0x10, 0, loop128    // bdnz .Loop128

-	LVX (PTR)(R0), RCON            // lvx 4,0,6     Last two round keys
+	LVX	(PTR)(R0), RCON // lvx 4,0,6     Last two round keys

 	// Key schedule (Round 9)
-	VPERM IN0, IN0, MASK, KEY      // vperm 3,1,1,5   Rotate-n-spat
-	VSLDOI $12, ZERO, IN0, TMP     // vsldoi 6,0,1,12
-	VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8  Rotate
-	VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR OUTTAIL, OUTTAIL, OUTHEAD  // vor 10,11,11
-	VCIPHERLAST KEY, RCON, KEY     // vcipherlast 3,3,4
-	STVX STAGE, (OUT+R0)           // stvx 7,0,5   Round 9
-	ADD $16, OUT, OUT              // addi 5,5,16
+	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5   Rotate-n-splat
+	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
+	VPERM	IN0, IN0, OUTPERM, OUTTAIL       // vperm 11,1,1,8  Rotate
+	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
+	VOR	OUTTAIL, OUTTAIL, OUTHEAD        // vor 10,11,11
+	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
+	STVX	STAGE, (OUT+R0)                  // stvx 7,0,5   Round 9
+	ADD	$16, OUT, OUT                    // addi 5,5,16

 	// Key schedule (Round 10)
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VADDUWM RCON, RCON, RCON       // vadduwm 4,4,4
-	VXOR IN0, KEY, IN0             // vxor 1,1,3
+	VXOR	IN0, TMP, IN0       // vxor 1,1,6
+	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
+	VXOR	IN0, TMP, IN0       // vxor 1,1,6
+	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
+	VXOR	IN0, TMP, IN0       // vxor 1,1,6
+	VADDUWM	RCON, RCON, RCON    // vadduwm 4,4,4
+	VXOR	IN0, KEY, IN0       // vxor 1,1,3

-	VPERM IN0, IN0, MASK, KEY      // vperm 3,1,1,5   Rotate-n-splat
-	VSLDOI $12, ZERO, IN0, TMP     // vsldoi 6,0,1,12
-	VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8  Rotate
-	VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR OUTTAIL, OUTTAIL, OUTHEAD  // vor 10,11,11
-	VCIPHERLAST KEY, RCON, KEY     // vcipherlast 3,3,4
-	STVX STAGE, (OUT+R0)           // stvx 7,0,5    Round 10
-	ADD $16, OUT, OUT              // addi 5,5,16
+	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5   Rotate-n-splat
+	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
+	VPERM	IN0, IN0, OUTPERM, OUTTAIL       // vperm 11,1,1,8  Rotate
+	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
+	VOR	OUTTAIL, OUTTAIL, OUTHEAD        // vor 10,11,11
+	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
+	STVX	STAGE, (OUT+R0)                  // stvx 7,0,5    Round 10
+	ADD	$16, OUT, OUT                    // addi 5,5,16

 	// Key schedule (Round 11)
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VXOR IN0, KEY, IN0             // vxor 1,1,3
-	VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
-	VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR OUTTAIL, OUTTAIL, OUTHEAD  // vor 10,11,11
-	STVX STAGE, (OUT+R0)           // stvx 7,0,5  Round 11
+	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
+	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
+	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
+	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
+	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
+	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
+	VPERM	IN0, IN0, OUTPERM, OUTTAIL       // vperm 11,1,1,8
+	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
+	VOR	OUTTAIL, OUTTAIL, OUTHEAD        // vor 10,11,11
+	STVX	STAGE, (OUT+R0)                  // stvx 7,0,5  Round 11

-	ADD $15, OUT, INP              // addi  3,5,15
-	ADD $0x50, OUT, OUT            // addi  5,5,0x50
+	ADD	$15, OUT, INP   // addi  3,5,15
+	ADD	$0x50, OUT, OUT // addi  5,5,0x50

-	MOVD $10, ROUNDS               // li    8,10
-	JMP done                       // b     .Ldone
+	MOVD	$10, ROUNDS // li    8,10
+	JMP	done        // b     .Ldone

 l192:
-	LVX (INP)(R0), TMP             // lvx 6,0,3
-	MOVD $4, CNT                   // li 7,4
-	VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
-	VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR OUTTAIL, OUTTAIL, OUTHEAD  // vor 10,11,11
-	STVX STAGE, (OUT+R0)           // stvx 7,0,5
-	ADD $16, OUT, OUT              // addi 5,5,16
-	VPERM IN1, TMP, KEY, IN1       // vperm 2,2,6,3
-	VSPLTISB $8, KEY               // vspltisb 3,8
-	MOVD CNT, CTR                  // mtctr 7
-	VSUBUBM MASK, KEY, MASK        // vsububm 5,5,3
+	LVX	(INP)(R0), TMP                   // lvx 6,0,3
+	MOVD	$4, CNT                          // li 7,4
+	VPERM	IN0, IN0, OUTPERM, OUTTAIL       // vperm 11,1,1,8
+	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
+	VOR	OUTTAIL, OUTTAIL, OUTHEAD        // vor 10,11,11
+	STVX	STAGE, (OUT+R0)                  // stvx 7,0,5
+	ADD	$16, OUT, OUT                    // addi 5,5,16
+	VPERM	IN1, TMP, KEY, IN1               // vperm 2,2,6,3
+	VSPLTISB	$8, KEY                  // vspltisb 3,8
+	MOVD	CNT, CTR                         // mtctr 7
+	VSUBUBM	MASK, KEY, MASK                  // vsububm 5,5,3

 loop192:
-	VPERM IN1, IN1, MASK, KEY      // vperm 3,2,2,5
-	VSLDOI $12, ZERO, IN0, TMP     // vsldoi 6,0,1,12
-	VCIPHERLAST KEY, RCON, KEY     // vcipherlast 3,3,4
+	VPERM	IN1, IN1, MASK, KEY // vperm 3,2,2,5
+	VSLDOI	$12, ZERO, IN0, TMP // vsldoi 6,0,1,12
+	VCIPHERLAST	KEY, RCON, KEY      // vcipherlast 3,3,4

-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
+	VXOR	IN0, TMP, IN0       // vxor 1,1,6
+	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
+	VXOR	IN0, TMP, IN0       // vxor 1,1,6
+	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
+	VXOR	IN0, TMP, IN0       // vxor 1,1,6

-	VSLDOI $8, ZERO, IN1, STAGE    // vsldoi 7,0,2,8
-	VSPLTW $3, IN0, TMP            // vspltw 6,1,3
-	VXOR TMP, IN1, TMP             // vxor 6,6,2
-	VSLDOI $12, ZERO, IN1, IN1     // vsldoi 2,0,2,12
-	VADDUWM RCON, RCON, RCON       // vadduwm 4,4,4
-	VXOR IN1, TMP, IN1             // vxor 2,2,6
-	VXOR IN0, KEY, IN0             // vxor 1,1,3
-	VXOR IN1, KEY, IN1             // vxor 2,2,3
-	VSLDOI $8, STAGE, IN0, STAGE   // vsldoi 7,7,1,8
+	VSLDOI	$8, ZERO, IN1, STAGE  // vsldoi 7,0,2,8
+	VSPLTW	$3, IN0, TMP          // vspltw 6,1,3
+	VXOR	TMP, IN1, TMP         // vxor 6,6,2
+	VSLDOI	$12, ZERO, IN1, IN1   // vsldoi 2,0,2,12
+	VADDUWM	RCON, RCON, RCON      // vadduwm 4,4,4
+	VXOR	IN1, TMP, IN1         // vxor 2,2,6
+	VXOR	IN0, KEY, IN0         // vxor 1,1,3
+	VXOR	IN1, KEY, IN1         // vxor 2,2,3
+	VSLDOI	$8, STAGE, IN0, STAGE // vsldoi 7,7,1,8

-	VPERM IN1, IN1, MASK, KEY      // vperm 3,2,2,5
-	VSLDOI $12, ZERO, IN0, TMP     // vsldoi 6,0,1,12
-	VPERM STAGE, STAGE, OUTPERM, OUTTAIL // vperm 11,7,7,8
-	VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR OUTTAIL, OUTTAIL, OUTHEAD  // vor 10,11,11
-	VCIPHERLAST KEY, RCON, KEY     // vcipherlast 3,3,4
-	STVX STAGE, (OUT+R0)           // stvx 7,0,5
-	ADD $16, OUT, OUT              // addi 5,5,16
+	VPERM	IN1, IN1, MASK, KEY              // vperm 3,2,2,5
+	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
+	VPERM	STAGE, STAGE, OUTPERM, OUTTAIL   // vperm 11,7,7,8
+	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
+	VOR	OUTTAIL, OUTTAIL, OUTHEAD        // vor 10,11,11
+	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
+	STVX	STAGE, (OUT+R0)                  // stvx 7,0,5
+	ADD	$16, OUT, OUT                    // addi 5,5,16

-	VSLDOI $8, IN0, IN1, STAGE     // vsldoi 7,1,2,8
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VPERM STAGE, STAGE, OUTPERM, OUTTAIL // vperm 11,7,7,8
-	VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR OUTTAIL, OUTTAIL, OUTHEAD  // vor 10,11,11
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	STVX STAGE, (OUT+R0)           // stvx 7,0,5
-	ADD $16, OUT, OUT              // addi 5,5,16
+	VSLDOI	$8, IN0, IN1, STAGE              // vsldoi 7,1,2,8
+	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
+	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
+	VPERM	STAGE, STAGE, OUTPERM, OUTTAIL   // vperm 11,7,7,8
+	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
+	VOR	OUTTAIL, OUTTAIL, OUTHEAD        // vor 10,11,11
+	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
+	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
+	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
+	STVX	STAGE, (OUT+R0)                  // stvx 7,0,5
+	ADD	$16, OUT, OUT                    // addi 5,5,16

-	VSPLTW $3, IN0, TMP            // vspltw 6,1,3
-	VXOR TMP, IN1, TMP             // vxor 6,6,2
-	VSLDOI $12, ZERO, IN1, IN1     // vsldoi 2,0,2,12
-	VADDUWM RCON, RCON, RCON       // vadduwm 4,4,4
-	VXOR IN1, TMP, IN1             // vxor 2,2,6
-	VXOR IN0, KEY, IN0             // vxor 1,1,3
-	VXOR IN1, KEY, IN1             // vxor 2,2,3
-	VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
-	VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR OUTTAIL, OUTTAIL, OUTHEAD  // vor 10,11,11
-	STVX STAGE, (OUT+R0)           // stvx 7,0,5
-	ADD $15, OUT, INP              // addi 3,5,15
-	ADD $16, OUT, OUT              // addi 5,5,16
-	BC 0x10, 0, loop192           // bdnz .Loop192
+	VSPLTW	$3, IN0, TMP                     // vspltw 6,1,3
+	VXOR	TMP, IN1, TMP                    // vxor 6,6,2
+	VSLDOI	$12, ZERO, IN1, IN1              // vsldoi 2,0,2,12
+	VADDUWM	RCON, RCON, RCON                 // vadduwm 4,4,4
+	VXOR	IN1, TMP, IN1                    // vxor 2,2,6
+	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
+	VXOR	IN1, KEY, IN1                    // vxor 2,2,3
+	VPERM	IN0, IN0, OUTPERM, OUTTAIL       // vperm 11,1,1,8
+	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
+	VOR	OUTTAIL, OUTTAIL, OUTHEAD        // vor 10,11,11
+	STVX	STAGE, (OUT+R0)                  // stvx 7,0,5
+	ADD	$15, OUT, INP                    // addi 3,5,15
+	ADD	$16, OUT, OUT                    // addi 5,5,16
+	BC	0x10, 0, loop192                 // bdnz .Loop192

-	MOVD $12, ROUNDS               // li 8,12
-	ADD $0x20, OUT, OUT            // addi 5,5,0x20
-	JMP done                       // b .Ldone
+	MOVD	$12, ROUNDS     // li 8,12
+	ADD	$0x20, OUT, OUT // addi 5,5,0x20
+	BR	done            // b .Ldone

 l256:
-	LVX (INP)(R0), TMP             // lvx 6,0,3
-	MOVD $7, CNT                   // li 7,7
-	MOVD $14, ROUNDS               // li 8,14
-	VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
-	VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR OUTTAIL, OUTTAIL, OUTHEAD  // vor 10,11,11
-	STVX STAGE, (OUT+R0)           // stvx 7,0,5
-	ADD $16, OUT, OUT              // addi 5,5,16
-	VPERM IN1, TMP, KEY, IN1       // vperm 2,2,6,3
-	MOVD CNT, CTR                  // mtctr 7
+	LVX	(INP)(R0), TMP                   // lvx 6,0,3
+	MOVD	$7, CNT                          // li 7,7
+	MOVD	$14, ROUNDS                      // li 8,14
+	VPERM	IN0, IN0, OUTPERM, OUTTAIL       // vperm 11,1,1,8
+	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
+	VOR	OUTTAIL, OUTTAIL, OUTHEAD        // vor 10,11,11
+	STVX	STAGE, (OUT+R0)                  // stvx 7,0,5
+	ADD	$16, OUT, OUT                    // addi 5,5,16
+	VPERM	IN1, TMP, KEY, IN1               // vperm 2,2,6,3
+	MOVD	CNT, CTR                         // mtctr 7

 loop256:
-	VPERM IN1, IN1, MASK, KEY      // vperm 3,2,2,5
-	VSLDOI $12, ZERO, IN0, TMP     // vsldoi 6,0,1,12
-	VPERM IN1, IN1, OUTPERM, OUTTAIL // vperm 11,2,2,8
-	VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR OUTTAIL, OUTTAIL, OUTHEAD  // vor 10,11,11
-	VCIPHERLAST KEY, RCON, KEY     // vcipherlast 3,3,4
-	STVX STAGE, (OUT+R0)           // stvx 7,0,5
-	ADD $16, OUT, OUT              // addi 5,5,16
+	VPERM	IN1, IN1, MASK, KEY              // vperm 3,2,2,5
+	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
+	VPERM	IN1, IN1, OUTPERM, OUTTAIL       // vperm 11,2,2,8
+	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
+	VOR	OUTTAIL, OUTTAIL, OUTHEAD        // vor 10,11,11
+	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
+	STVX	STAGE, (OUT+R0)                  // stvx 7,0,5
+	ADD	$16, OUT, OUT                    // addi 5,5,16

-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VXOR IN0, TMP, IN0             // vxor 1,1,6
-	VADDUWM RCON, RCON, RCON       // vadduwm 4,4,4
-	VXOR IN0, KEY, IN0             // vxor 1,1,3
-	VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
-	VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
-	VOR OUTTAIL, OUTTAIL, OUTHEAD  // vor 10,11,11
-	STVX STAGE, (OUT+R0)           // stvx 7,0,5
-	ADD $15, OUT, INP              // addi 3,5,15
-	ADD $16, OUT, OUT              // addi 5,5,16
-	BC 0x12, 0, done               // bdz .Ldone
+	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
+	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
+	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
+	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
+	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
+	VADDUWM	RCON, RCON, RCON                 // vadduwm 4,4,4
+	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
+	VPERM	IN0, IN0, OUTPERM, OUTTAIL       // vperm 11,1,1,8
+	VSEL	OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
+	VOR	OUTTAIL, OUTTAIL, OUTHEAD        // vor 10,11,11
+	STVX	STAGE, (OUT+R0)                  // stvx 7,0,5
+	ADD	$15, OUT, INP                    // addi 3,5,15
+	ADD	$16, OUT, OUT                    // addi 5,5,16
+	BC	0x12, 0, done                    // bdz .Ldone

-	VSPLTW $3, IN0, KEY            // vspltw 3,1,3
-	VSLDOI $12, ZERO, IN1, TMP     // vsldoi 6,0,2,12
-	VSBOX KEY, KEY                 // vsbox 3,3
+	VSPLTW	$3, IN0, KEY        // vspltw 3,1,3
+	VSLDOI	$12, ZERO, IN1, TMP // vsldoi 6,0,2,12
+	VSBOX	KEY, KEY            // vsbox 3,3

-	VXOR IN1, TMP, IN1             // vxor 2,2,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VXOR IN1, TMP, IN1             // vxor 2,2,6
-	VSLDOI $12, ZERO, TMP, TMP     // vsldoi 6,0,6,12
-	VXOR IN1, TMP, IN1             // vxor 2,2,6
+	VXOR	IN1, TMP, IN1       // vxor 2,2,6
+	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
+	VXOR	IN1, TMP, IN1       // vxor 2,2,6
+	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
+	VXOR	IN1, TMP, IN1       // vxor 2,2,6

-	VXOR IN1, KEY, IN1             // vxor 2,2,3
-	JMP loop256                    // b .Loop256
+	VXOR	IN1, KEY, IN1 // vxor 2,2,3
+	JMP	loop256       // b .Loop256

 done:
-	LVX (INP)(R0), IN1             // lvx   2,0,3
-	VSEL OUTHEAD, IN1, OUTMASK, IN1 // vsel 2,10,2,9
-	STVX IN1, (INP+R0)             // stvx  2,0,3
-	MOVD $0, PTR                   // li    6,0    set PTR to 0 (exit code 0)
-	MOVW ROUNDS, 0(OUT)            // stw   8,0(5)
+	LVX	(INP)(R0), IN1             // lvx   2,0,3
+	VSEL	OUTHEAD, IN1, OUTMASK, IN1 // vsel 2,10,2,9
+	STVX	IN1, (INP+R0)              // stvx  2,0,3
+	MOVD	$0, PTR                    // li    6,0    set PTR to 0 (exit code 0)
+	MOVW	ROUNDS, 0(OUT)             // stw   8,0(5)

 enc_key_abort:
-	MOVD PTR, INP                  // mr    3,6    set exit code with PTR value
-	MOVD INP, ret+24(FP)           // Put return value into the FP
-	RET                            // blr
+	MOVD	PTR, INP        // mr    3,6    set exit code with PTR value
+	MOVD	INP, ret+24(FP) // Put return value into the FP
+	RET                  // blr

 // func setDecryptKeyAsm(key *byte, keylen int, dec *uint32) int
-TEXT ·setDecryptKeyAsm(SB),NOSPLIT|NOFRAME,$0
+TEXT ·setDecryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
 	// Load the arguments inside the registers
-	MOVD key+0(FP), INP
-	MOVD keylen+8(FP), BITS
-	MOVD dec+16(FP), OUT
+	MOVD	key+0(FP), INP
+	MOVD	keylen+8(FP), BITS
+	MOVD	dec+16(FP), OUT

-	MOVD LR, R10                   // mflr 10
-	CALL ·doEncryptKeyAsm(SB)
-	MOVD R10, LR                   // mtlr 10
+	MOVD	LR, R10              // mflr 10
+	CALL	·doEncryptKeyAsm(SB)
+	MOVD	R10, LR              // mtlr 10

-	CMPW INP, $0                   // cmpwi 3,0  exit 0 = ok
-	BC 0x06, 2, dec_key_abort      // bne- .Ldec_key_abort
+	CMPW	INP, $0                // cmpwi 3,0  exit 0 = ok
+	BC	0x06, 2, dec_key_abort // bne- .Ldec_key_abort

 	// doEncryptKeyAsm sets ROUNDS (R8) to the proper value for each mode
-	SLW $4, ROUNDS, CNT            // slwi 7,8,4
-	SUB $240, OUT, INP             // subi 3,5,240
-	SRW $1, ROUNDS, ROUNDS         // srwi 8,8,1
-	ADD R7, INP, OUT               // add 5,3,7
-	MOVD ROUNDS, CTR               // mtctr 8
+	SLW	$4, ROUNDS, CNT    // slwi 7,8,4
+	SUB	$240, OUT, INP     // subi 3,5,240
+	SRW	$1, ROUNDS, ROUNDS // srwi 8,8,1
+	ADD	R7, INP, OUT       // add 5,3,7
+	MOVD	ROUNDS, CTR        // mtctr 8

-// dec_key will invert the key sequence in order to be used for decrypt
+	// dec_key will invert the key sequence in order to be used for decrypt
 dec_key:
-	MOVWZ 0(INP), TEMP             // lwz 0, 0(3)
-	MOVWZ 4(INP), R6               // lwz 6, 4(3)
-	MOVWZ 8(INP), R7               // lwz 7, 8(3)
-	MOVWZ 12(INP), R8              // lwz 8, 12(3)
-	ADD $16, INP, INP              // addi 3,3,16
-	MOVWZ 0(OUT), R9               // lwz 9, 0(5)
-	MOVWZ 4(OUT), R10              // lwz 10,4(5)
-	MOVWZ 8(OUT), R11              // lwz 11,8(5)
-	MOVWZ 12(OUT), R12             // lwz 12,12(5)
-	MOVW TEMP, 0(OUT)              // stw 0, 0(5)
-	MOVW R6, 4(OUT)                // stw 6, 4(5)
-	MOVW R7, 8(OUT)                // stw 7, 8(5)
-	MOVW R8, 12(OUT)               // stw 8, 12(5)
-	SUB $16, OUT, OUT              // subi 5,5,16
-	MOVW R9, -16(INP)              // stw 9, -16(3)
-	MOVW R10, -12(INP)             // stw 10,-12(3)
-	MOVW R11, -8(INP)              // stw 11,-8(3)
-	MOVW R12, -4(INP)              // stw 12,-4(3)
-	BC 0x10, 0, dec_key            // bdnz .Ldeckey
+	MOVWZ	0(INP), TEMP     // lwz 0, 0(3)
+	MOVWZ	4(INP), R6       // lwz 6, 4(3)
+	MOVWZ	8(INP), R7       // lwz 7, 8(3)
+	MOVWZ	12(INP), R8      // lwz 8, 12(3)
+	ADD	$16, INP, INP    // addi 3,3,16
+	MOVWZ	0(OUT), R9       // lwz 9, 0(5)
+	MOVWZ	4(OUT), R10      // lwz 10,4(5)
+	MOVWZ	8(OUT), R11      // lwz 11,8(5)
+	MOVWZ	12(OUT), R12     // lwz 12,12(5)
+	MOVW	TEMP, 0(OUT)     // stw 0, 0(5)
+	MOVW	R6, 4(OUT)       // stw 6, 4(5)
+	MOVW	R7, 8(OUT)       // stw 7, 8(5)
+	MOVW	R8, 12(OUT)      // stw 8, 12(5)
+	SUB	$16, OUT, OUT    // subi 5,5,16
+	MOVW	R9, -16(INP)     // stw 9, -16(3)
+	MOVW	R10, -12(INP)    // stw 10,-12(3)
+	MOVW	R11, -8(INP)     // stw 11,-8(3)
+	MOVW	R12, -4(INP)     // stw 12,-4(3)
+	BC	0x10, 0, dec_key // bdnz .Ldeckey

-	XOR R3, R3, R3                 // xor 3,3,3      Clean R3
+	XOR	R3, R3, R3 // xor 3,3,3      Clean R3

 dec_key_abort:
-	MOVD R3, ret+24(FP)            // Put return value into the FP
-	RET                            // blr
-
+	MOVD	R3, ret+24(FP) // Put return value into the FP
+	RET                 // blr

 // func encryptBlockAsm(dst, src *byte, enc *uint32)
-TEXT ·encryptBlockAsm(SB),NOSPLIT|NOFRAME,$0
+TEXT ·encryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
 	// Load the arguments inside the registers
-	MOVD dst+0(FP), BLK_OUT
-	MOVD src+8(FP), BLK_INP
-	MOVD enc+16(FP), BLK_KEY
+	MOVD	dst+0(FP), BLK_OUT
+	MOVD	src+8(FP), BLK_INP
+	MOVD	enc+16(FP), BLK_KEY

-	MOVWZ 240(BLK_KEY), BLK_ROUNDS // lwz 6,240(5)
-	MOVD $15, BLK_IDX              // li 7,15
+	MOVWZ	240(BLK_KEY), BLK_ROUNDS // lwz 6,240(5)
+	MOVD	$15, BLK_IDX             // li 7,15

-	LVX (BLK_INP)(R0), ZERO        // lvx 0,0,3
-	NEG BLK_OUT, R11               // neg 11,4
-	LVX (BLK_INP)(BLK_IDX), IN0    // lvx 1,7,3
-	LVSL (BLK_INP)(R0), IN1        // lvsl 2,0,3
-	VSPLTISB $0x0f, RCON           // vspltisb 4,0x0f
-	LVSR (R11)(R0), KEY            // lvsr 3,0,11
-	VXOR IN1, RCON, IN1            // vxor 2,2,4
-	MOVD $16, BLK_IDX              // li 7,16
-	VPERM ZERO, IN0, IN1, ZERO     // vperm 0,0,1,2
-	LVX (BLK_KEY)(R0), IN0         // lvx 1,0,5
-	LVSR (BLK_KEY)(R0), MASK       // lvsr 5,0,5
-	SRW $1, BLK_ROUNDS, BLK_ROUNDS // srwi 6,6,1
-	LVX (BLK_KEY)(BLK_IDX), IN1    // lvx 2,7,5
-	ADD $16, BLK_IDX, BLK_IDX      // addi 7,7,16
-	SUB $1, BLK_ROUNDS, BLK_ROUNDS // subi 6,6,1
-	VPERM IN1, IN0, MASK, IN0      // vperm 1,2,1,5
+	LVX	(BLK_INP)(R0), ZERO        // lvx 0,0,3
+	NEG	BLK_OUT, R11               // neg 11,4
+	LVX	(BLK_INP)(BLK_IDX), IN0    // lvx 1,7,3
+	LVSL	(BLK_INP)(R0), IN1         // lvsl 2,0,3
+	VSPLTISB	$0x0f, RCON        // vspltisb 4,0x0f
+	LVSR	(R11)(R0), KEY             // lvsr 3,0,11
+	VXOR	IN1, RCON, IN1             // vxor 2,2,4
+	MOVD	$16, BLK_IDX               // li 7,16
+	VPERM	ZERO, IN0, IN1, ZERO       // vperm 0,0,1,2
+	LVX	(BLK_KEY)(R0), IN0         // lvx 1,0,5
+	LVSR	(BLK_KEY)(R0), MASK        // lvsr 5,0,5
+	SRW	$1, BLK_ROUNDS, BLK_ROUNDS // srwi 6,6,1
+	LVX	(BLK_KEY)(BLK_IDX), IN1    // lvx 2,7,5
+	ADD	$16, BLK_IDX, BLK_IDX      // addi 7,7,16
+	SUB	$1, BLK_ROUNDS, BLK_ROUNDS // subi 6,6,1
+	VPERM	IN1, IN0, MASK, IN0        // vperm 1,2,1,5

-	VXOR ZERO, IN0, ZERO           // vxor 0,0,1
-	LVX (BLK_KEY)(BLK_IDX), IN0    // lvx 1,7,5
-	ADD $16, BLK_IDX, BLK_IDX      // addi 7,7,16
-	MOVD BLK_ROUNDS, CTR           // mtctr 6
+	VXOR	ZERO, IN0, ZERO         // vxor 0,0,1
+	LVX	(BLK_KEY)(BLK_IDX), IN0 // lvx 1,7,5
+	ADD	$16, BLK_IDX, BLK_IDX   // addi 7,7,16
+	MOVD	BLK_ROUNDS, CTR         // mtctr 6

 loop_enc:
-	VPERM IN0, IN1, MASK, IN1      // vperm 2,1,2,5
-	VCIPHER ZERO, IN1, ZERO        // vcipher 0,0,2
-	LVX (BLK_KEY)(BLK_IDX), IN1    // lvx 2,7,5
-	ADD $16, BLK_IDX, BLK_IDX      // addi 7,7,16
-	VPERM IN1, IN0, MASK, IN0      // vperm 1,2,1,5
-	VCIPHER ZERO, IN0, ZERO        // vcipher 0,0,1
-	LVX (BLK_KEY)(BLK_IDX), IN0    // lvx 1,7,5
-	ADD $16, BLK_IDX, BLK_IDX      // addi 7,7,16
-	BC 0x10, 0, loop_enc           // bdnz .Loop_enc
+	VPERM	IN0, IN1, MASK, IN1     // vperm 2,1,2,5
+	VCIPHER	ZERO, IN1, ZERO         // vcipher 0,0,2
+	LVX	(BLK_KEY)(BLK_IDX), IN1 // lvx 2,7,5
+	ADD	$16, BLK_IDX, BLK_IDX   // addi 7,7,16
+	VPERM	IN1, IN0, MASK, IN0     // vperm 1,2,1,5
+	VCIPHER	ZERO, IN0, ZERO         // vcipher 0,0,1
+	LVX	(BLK_KEY)(BLK_IDX), IN0 // lvx 1,7,5
+	ADD	$16, BLK_IDX, BLK_IDX   // addi 7,7,16
+	BC	0x10, 0, loop_enc       // bdnz .Loop_enc

-	VPERM IN0, IN1, MASK, IN1      // vperm 2,1,2,5
-	VCIPHER ZERO, IN1, ZERO        // vcipher 0,0,2
-	LVX (BLK_KEY)(BLK_IDX), IN1    // lvx 2,7,5
-	VPERM IN1, IN0, MASK, IN0      // vperm 1,2,1,5
-	VCIPHERLAST ZERO, IN0, ZERO    // vcipherlast 0,0,1
+	VPERM	IN0, IN1, MASK, IN1     // vperm 2,1,2,5
+	VCIPHER	ZERO, IN1, ZERO         // vcipher 0,0,2
+	LVX	(BLK_KEY)(BLK_IDX), IN1 // lvx 2,7,5
+	VPERM	IN1, IN0, MASK, IN0     // vperm 1,2,1,5
+	VCIPHERLAST	ZERO, IN0, ZERO // vcipherlast 0,0,1

-	VSPLTISB $-1, IN1              // vspltisb 2,-1
-	VXOR IN0, IN0, IN0             // vxor 1,1,1
-	MOVD $15, BLK_IDX              // li 7,15
-	VPERM IN1, IN0, KEY, IN1       // vperm 2,2,1,3
-	VXOR KEY, RCON, KEY            // vxor 3,3,4
-	LVX (BLK_OUT)(R0), IN0         // lvx 1,0,4
-	VPERM ZERO, ZERO, KEY, ZERO    // vperm 0,0,0,3
-	VSEL IN0, ZERO, IN1, IN0       // vsel 1,1,0,2
-	LVX (BLK_OUT)(BLK_IDX), RCON   // lvx 4,7,4
-	STVX IN0, (BLK_OUT+R0)         // stvx 1,0,4
-	VSEL ZERO, RCON, IN1, ZERO     // vsel 0,0,4,2
-	STVX ZERO, (BLK_OUT+BLK_IDX)   // stvx 0,7,4
-
-	RET                            // blr
+	VSPLTISB	$-1, IN1         // vspltisb 2,-1
+	VXOR	IN0, IN0, IN0            // vxor 1,1,1
+	MOVD	$15, BLK_IDX             // li 7,15
+	VPERM	IN1, IN0, KEY, IN1       // vperm 2,2,1,3
+	VXOR	KEY, RCON, KEY           // vxor 3,3,4
+	LVX	(BLK_OUT)(R0), IN0       // lvx 1,0,4
+	VPERM	ZERO, ZERO, KEY, ZERO    // vperm 0,0,0,3
+	VSEL	IN0, ZERO, IN1, IN0      // vsel 1,1,0,2
+	LVX	(BLK_OUT)(BLK_IDX), RCON // lvx 4,7,4
+	STVX	IN0, (BLK_OUT+R0)        // stvx 1,0,4
+	VSEL	ZERO, RCON, IN1, ZERO    // vsel 0,0,4,2
+	STVX	ZERO, (BLK_OUT+BLK_IDX)  // stvx 0,7,4

+	RET // blr

 // func decryptBlockAsm(dst, src *byte, dec *uint32)
-TEXT ·decryptBlockAsm(SB),NOSPLIT|NOFRAME,$0
+TEXT ·decryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
 	// Load the arguments inside the registers
-	MOVD dst+0(FP), BLK_OUT
-	MOVD src+8(FP), BLK_INP
-	MOVD dec+16(FP), BLK_KEY
+	MOVD	dst+0(FP), BLK_OUT
+	MOVD	src+8(FP), BLK_INP
+	MOVD	dec+16(FP), BLK_KEY

-	MOVWZ 240(BLK_KEY), BLK_ROUNDS // lwz 6,240(5)
-	MOVD $15, BLK_IDX              // li 7,15
+	MOVWZ	240(BLK_KEY), BLK_ROUNDS // lwz 6,240(5)
+	MOVD	$15, BLK_IDX             // li 7,15

-	LVX (BLK_INP)(R0), ZERO        // lvx 0,0,3
-	NEG BLK_OUT, R11               // neg 11,4
-	LVX (BLK_INP)(BLK_IDX), IN0    // lvx 1,7,3
-	LVSL (BLK_INP)(R0), IN1        // lvsl 2,0,3
-	VSPLTISB $0x0f, RCON           // vspltisb 4,0x0f
-	LVSR (R11)(R0), KEY            // lvsr 3,0,11
-	VXOR IN1, RCON, IN1            // vxor 2,2,4
-	MOVD $16, BLK_IDX              // li 7,16
-	VPERM ZERO, IN0, IN1, ZERO     // vperm 0,0,1,2
-	LVX (BLK_KEY)(R0), IN0         // lvx 1,0,5
-	LVSR (BLK_KEY)(R0), MASK       // lvsr 5,0,5
-	SRW $1, BLK_ROUNDS, BLK_ROUNDS // srwi 6,6,1
-	LVX (BLK_KEY)(BLK_IDX), IN1    // lvx 2,7,5
-	ADD $16, BLK_IDX, BLK_IDX      // addi 7,7,16
-	SUB $1, BLK_ROUNDS, BLK_ROUNDS // subi 6,6,1
-	VPERM IN1, IN0, MASK, IN0      // vperm 1,2,1,5
+	LVX	(BLK_INP)(R0), ZERO        // lvx 0,0,3
+	NEG	BLK_OUT, R11               // neg 11,4
+	LVX	(BLK_INP)(BLK_IDX), IN0    // lvx 1,7,3
+	LVSL	(BLK_INP)(R0), IN1         // lvsl 2,0,3
+	VSPLTISB	$0x0f, RCON        // vspltisb 4,0x0f
+	LVSR	(R11)(R0), KEY             // lvsr 3,0,11
+	VXOR	IN1, RCON, IN1             // vxor 2,2,4
+	MOVD	$16, BLK_IDX               // li 7,16
+	VPERM	ZERO, IN0, IN1, ZERO       // vperm 0,0,1,2
+	LVX	(BLK_KEY)(R0), IN0         // lvx 1,0,5
+	LVSR	(BLK_KEY)(R0), MASK        // lvsr 5,0,5
+	SRW	$1, BLK_ROUNDS, BLK_ROUNDS // srwi 6,6,1
+	LVX	(BLK_KEY)(BLK_IDX), IN1    // lvx 2,7,5
+	ADD	$16, BLK_IDX, BLK_IDX      // addi 7,7,16
+	SUB	$1, BLK_ROUNDS, BLK_ROUNDS // subi 6,6,1
+	VPERM	IN1, IN0, MASK, IN0        // vperm 1,2,1,5

-	VXOR ZERO, IN0, ZERO           // vxor 0,0,1
-	LVX (BLK_KEY)(BLK_IDX), IN0    // lvx 1,7,5
-	ADD $16, BLK_IDX, BLK_IDX      // addi 7,7,16
-	MOVD BLK_ROUNDS, CTR           // mtctr 6
+	VXOR	ZERO, IN0, ZERO         // vxor 0,0,1
+	LVX	(BLK_KEY)(BLK_IDX), IN0 // lvx 1,7,5
+	ADD	$16, BLK_IDX, BLK_IDX   // addi 7,7,16
+	MOVD	BLK_ROUNDS, CTR         // mtctr 6

 loop_dec:
-	VPERM IN0, IN1, MASK, IN1      // vperm 2,1,2,5
-	VNCIPHER ZERO, IN1, ZERO       // vncipher 0,0,2
-	LVX (BLK_KEY)(BLK_IDX), IN1    // lvx 2,7,5
-	ADD $16, BLK_IDX, BLK_IDX      // addi 7,7,16
-	VPERM IN1, IN0, MASK, IN0      // vperm 1,2,1,5
-	VNCIPHER ZERO, IN0, ZERO       // vncipher 0,0,1
-	LVX (BLK_KEY)(BLK_IDX), IN0    // lvx 1,7,5
-	ADD $16, BLK_IDX, BLK_IDX      // addi 7,7,16
-	BC 0x10, 0, loop_dec           // bdnz .Loop_dec
+	VPERM	IN0, IN1, MASK, IN1     // vperm 2,1,2,5
+	VNCIPHER	ZERO, IN1, ZERO // vncipher 0,0,2
+	LVX	(BLK_KEY)(BLK_IDX), IN1 // lvx 2,7,5
+	ADD	$16, BLK_IDX, BLK_IDX   // addi 7,7,16
+	VPERM	IN1, IN0, MASK, IN0     // vperm 1,2,1,5
+	VNCIPHER	ZERO, IN0, ZERO // vncipher 0,0,1
+	LVX	(BLK_KEY)(BLK_IDX), IN0 // lvx 1,7,5
+	ADD	$16, BLK_IDX, BLK_IDX   // addi 7,7,16
+	BC	0x10, 0, loop_dec       // bdnz .Loop_dec

-	VPERM IN0, IN1, MASK, IN1      // vperm 2,1,2,5
-	VNCIPHER ZERO, IN1, ZERO       // vncipher 0,0,2
-	LVX (BLK_KEY)(BLK_IDX), IN1    // lvx 2,7,5
-	VPERM IN1, IN0, MASK, IN0      // vperm 1,2,1,5
-	VNCIPHERLAST ZERO, IN0, ZERO   // vncipherlast 0,0,1
+	VPERM	IN0, IN1, MASK, IN1     // vperm 2,1,2,5
+	VNCIPHER	ZERO, IN1, ZERO // vncipher 0,0,2
+	LVX	(BLK_KEY)(BLK_IDX), IN1 // lvx 2,7,5
+	VPERM	IN1, IN0, MASK, IN0     // vperm 1,2,1,5
+	VNCIPHERLAST	ZERO, IN0, ZERO // vncipherlast 0,0,1

-	VSPLTISB $-1, IN1              // vspltisb 2,-1
-	VXOR IN0, IN0, IN0             // vxor 1,1,1
-	MOVD $15, BLK_IDX              // li 7,15
-	VPERM IN1, IN0, KEY, IN1       // vperm 2,2,1,3
-	VXOR KEY, RCON, KEY            // vxor 3,3,4
-	LVX (BLK_OUT)(R0), IN0         // lvx 1,0,4
-	VPERM ZERO, ZERO, KEY, ZERO    // vperm 0,0,0,3
-	VSEL IN0, ZERO, IN1, IN0       // vsel 1,1,0,2
-	LVX (BLK_OUT)(BLK_IDX), RCON   // lvx 4,7,4
-	STVX IN0, (BLK_OUT+R0)         // stvx 1,0,4
-	VSEL ZERO, RCON, IN1, ZERO     // vsel 0,0,4,2
-	STVX ZERO, (BLK_OUT+BLK_IDX)   // stvx 0,7,4
+	VSPLTISB	$-1, IN1         // vspltisb 2,-1
+	VXOR	IN0, IN0, IN0            // vxor 1,1,1
+	MOVD	$15, BLK_IDX             // li 7,15
+	VPERM	IN1, IN0, KEY, IN1       // vperm 2,2,1,3
+	VXOR	KEY, RCON, KEY           // vxor 3,3,4
+	LVX	(BLK_OUT)(R0), IN0       // lvx 1,0,4
+	VPERM	ZERO, ZERO, KEY, ZERO    // vperm 0,0,0,3
+	VSEL	IN0, ZERO, IN1, IN0      // vsel 1,1,0,2
+	LVX	(BLK_OUT)(BLK_IDX), RCON // lvx 4,7,4
+	STVX	IN0, (BLK_OUT+R0)        // stvx 1,0,4
+	VSEL	ZERO, RCON, IN1, ZERO    // vsel 0,0,4,2
+	STVX	ZERO, (BLK_OUT+BLK_IDX)  // stvx 0,7,4
+
+	RET // blr

-	RET                            // blr
--- a/src/crypto/elliptic/elliptic.go
+++ b/src/crypto/elliptic/elliptic.go
@ -85,7 +85,7 @@ func (curve *CurveParams) polynomial(x *big.Int) *big.Int {
 func (curve *CurveParams) IsOnCurve(x, y *big.Int) bool {
 	// If there is a dedicated constant-time implementation for this curve operation,
 	// use that instead of the generic one.
-	if specific, ok := matchesSpecificCurve(curve, p224, p521); ok {
+	if specific, ok := matchesSpecificCurve(curve, p224, p384, p521); ok {
 		return specific.IsOnCurve(x, y)
 	}

@ -128,7 +128,7 @@ func (curve *CurveParams) affineFromJacobian(x, y, z *big.Int) (xOut, yOut *big.
 func (curve *CurveParams) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
 	// If there is a dedicated constant-time implementation for this curve operation,
 	// use that instead of the generic one.
-	if specific, ok := matchesSpecificCurve(curve, p224, p521); ok {
+	if specific, ok := matchesSpecificCurve(curve, p224, p384, p521); ok {
 		return specific.Add(x1, y1, x2, y2)
 	}

@ -218,7 +218,7 @@ func (curve *CurveParams) addJacobian(x1, y1, z1, x2, y2, z2 *big.Int) (*big.Int
 func (curve *CurveParams) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
 	// If there is a dedicated constant-time implementation for this curve operation,
 	// use that instead of the generic one.
-	if specific, ok := matchesSpecificCurve(curve, p224, p521); ok {
+	if specific, ok := matchesSpecificCurve(curve, p224, p384, p521); ok {
 		return specific.Double(x1, y1)
 	}

@ -290,7 +290,7 @@ func (curve *CurveParams) doubleJacobian(x, y, z *big.Int) (*big.Int, *big.Int,
 func (curve *CurveParams) ScalarMult(Bx, By *big.Int, k []byte) (*big.Int, *big.Int) {
 	// If there is a dedicated constant-time implementation for this curve operation,
 	// use that instead of the generic one.
-	if specific, ok := matchesSpecificCurve(curve, p224, p256, p521); ok {
+	if specific, ok := matchesSpecificCurve(curve, p224, p256, p384, p521); ok {
 		return specific.ScalarMult(Bx, By, k)
 	}

@ -313,7 +313,7 @@ func (curve *CurveParams) ScalarMult(Bx, By *big.Int, k []byte) (*big.Int, *big.
 func (curve *CurveParams) ScalarBaseMult(k []byte) (*big.Int, *big.Int) {
 	// If there is a dedicated constant-time implementation for this curve operation,
 	// use that instead of the generic one.
-	if specific, ok := matchesSpecificCurve(curve, p224, p256, p521); ok {
+	if specific, ok := matchesSpecificCurve(curve, p224, p256, p384, p521); ok {
 		return specific.ScalarBaseMult(k)
 	}

@ -431,7 +431,6 @@ func UnmarshalCompressed(curve Curve, data []byte) (x, y *big.Int) {
 }

 var initonce sync.Once
-var p384 *CurveParams

 func initAll() {
 	initP224()
@ -440,15 +439,16 @@ func initAll() {
 	initP521()
 }

-func initP384() {
-	// See FIPS 186-3, section D.2.4
-	p384 = &CurveParams{Name: "P-384"}
-	p384.P, _ = new(big.Int).SetString("39402006196394479212279040100143613805079739270465446667948293404245721771496870329047266088258938001861606973112319", 10)
-	p384.N, _ = new(big.Int).SetString("39402006196394479212279040100143613805079739270465446667946905279627659399113263569398956308152294913554433653942643", 10)
-	p384.B, _ = new(big.Int).SetString("b3312fa7e23ee7e4988e056be3f82d19181d9c6efe8141120314088f5013875ac656398d8a2ed19d2a85c8edd3ec2aef", 16)
-	p384.Gx, _ = new(big.Int).SetString("aa87ca22be8b05378eb1c71ef320ad746e1d3b628ba79b9859f741e082542a385502f25dbf55296c3a545e3872760ab7", 16)
-	p384.Gy, _ = new(big.Int).SetString("3617de4a96262c6f5d9e98bf9292dc29f8f41dbd289a147ce9da3113b5f0b8c00a60b1ce1d7e819d7a431d7c90ea0e5f", 16)
-	p384.BitSize = 384
+// P224 returns a Curve which implements NIST P-224 (FIPS 186-3, section D.2.2),
+// also known as secp224r1. The CurveParams.Name of this Curve is "P-224".
+//
+// Multiple invocations of this function will return the same value, so it can
+// be used for equality checks and switch statements.
+//
+// The cryptographic operations are implemented using constant-time algorithms.
+func P224() Curve {
+	initonce.Do(initAll)
+	return p224
 }

 // P256 returns a Curve which implements NIST P-256 (FIPS 186-3, section D.2.3),
@ -470,7 +470,7 @@ func P256() Curve {
 // Multiple invocations of this function will return the same value, so it can
 // be used for equality checks and switch statements.
 //
-// The cryptographic operations do not use constant-time algorithms.
+// The cryptographic operations are implemented using constant-time algorithms.
 func P384() Curve {
 	initonce.Do(initAll)
 	return p384
--- a/src/crypto/elliptic/elliptic_test.go
+++ b/src/crypto/elliptic/elliptic_test.go
@ -14,9 +14,8 @@ import (

 // genericParamsForCurve returns the dereferenced CurveParams for
 // the specified curve. This is used to avoid the logic for
-// upgrading a curve to it's specific implementation, forcing
-// usage of the generic implementation. This is only relevant
-// for the P224, P256, and P521 curves.
+// upgrading a curve to its specific implementation, forcing
+// usage of the generic implementation.
 func genericParamsForCurve(c Curve) *CurveParams {
 	d := *(c.Params())
 	return &d
--- a/src/crypto/elliptic/internal/fiat/Dockerfile
+++ b/src/crypto/elliptic/internal/fiat/Dockerfile
@ -4,9 +4,9 @@

 FROM coqorg/coq:8.13.2

-RUN git clone https://github.com/mit-plv/fiat-crypto
-RUN cd fiat-crypto && git checkout c076f3550bea2bb7f4cb5766a32594b9e67694f2
-RUN cd fiat-crypto && git submodule update --init --recursive
+RUN git clone https://github.com/mit-plv/fiat-crypto && cd fiat-crypto && \
+    git checkout 23d2dbc4ab897d14bde4404f70cd6991635f9c01 && \
+    git submodule update --init --recursive
 RUN cd fiat-crypto && eval $(opam env) && make -j4 standalone-ocaml SKIP_BEDROCK2=1

-ENTRYPOINT ["fiat-crypto/src/ExtractionOCaml/unsaturated_solinas"]
+ENV PATH=/home/coq/fiat-crypto/src/ExtractionOCaml:$PATH
--- a/src/crypto/elliptic/internal/fiat/README
+++ b/src/crypto/elliptic/internal/fiat/README
@ -1,17 +1,12 @@
 The code in this package was autogenerated by the fiat-crypto project
-at commit c076f3550 from a formally verified model.
+at version v0.0.9 from a formally verified model, and by the addchain
+project at a recent tip version.

-    docker build -t fiat-crypto:c076f3550 .
-    docker run fiat-crypto:c076f3550 --lang Go --no-wide-int --cmovznz-by-mul \
-        --internal-static --public-function-case camelCase --public-type-case camelCase \
-        --private-function-case camelCase --private-type-case camelCase \
-        --no-prefix-fiat --package-name fiat --doc-text-before-function-name '' \
-        --doc-prepend-header 'Code generated by Fiat Cryptography. DO NOT EDIT.' \
-        --doc-newline-before-package-declaration p521 64 9 '2^521 - 1' \
-        carry_mul carry_square carry add sub to_bytes from_bytes selectznz \
-        > p521_fiat64.go
+    docker build -t fiat-crypto:v0.0.9 .
+    go install github.com/mmcloughlin/addchain/cmd/addchain@v0.3.1-0.20211027081849-6a7d3decbe08
+    ../../../../../bin/go run generate.go

-It comes under the following license.
+fiat-crypto code comes under the following license.

    Copyright (c) 2015-2020 The fiat-crypto Authors. All rights reserved.

--- a/src/crypto/elliptic/internal/fiat/fiat_test.go
+++ b/src/crypto/elliptic/internal/fiat/fiat_test.go
@ -0,0 +1,64 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package fiat_test
+
+import (
+	"crypto/elliptic/internal/fiat"
+	"testing"
+)
+
+func BenchmarkMul(b *testing.B) {
+	b.Run("P224", func(b *testing.B) {
+		v := new(fiat.P224Element).One()
+		b.ReportAllocs()
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			v.Mul(v, v)
+		}
+	})
+	b.Run("P384", func(b *testing.B) {
+		v := new(fiat.P384Element).One()
+		b.ReportAllocs()
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			v.Mul(v, v)
+		}
+	})
+	b.Run("P521", func(b *testing.B) {
+		v := new(fiat.P521Element).One()
+		b.ReportAllocs()
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			v.Mul(v, v)
+		}
+	})
+}
+
+func BenchmarkSquare(b *testing.B) {
+	b.Run("P224", func(b *testing.B) {
+		v := new(fiat.P224Element).One()
+		b.ReportAllocs()
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			v.Square(v)
+		}
+	})
+	b.Run("P384", func(b *testing.B) {
+		v := new(fiat.P384Element).One()
+		b.ReportAllocs()
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			v.Square(v)
+		}
+	})
+	b.Run("P521", func(b *testing.B) {
+		v := new(fiat.P521Element).One()
+		b.ReportAllocs()
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			v.Square(v)
+		}
+	})
+}
--- a/src/crypto/elliptic/internal/fiat/generate.go
+++ b/src/crypto/elliptic/internal/fiat/generate.go
@ -0,0 +1,330 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ignore
+
+package main
+
+import (
+	"bytes"
+	"go/format"
+	"io"
+	"log"
+	"os"
+	"os/exec"
+	"text/template"
+)
+
+var curves = []struct {
+	Element  string
+	Prime    string
+	Prefix   string
+	FiatType string
+	BytesLen int
+}{
+	{
+		Element:  "P224Element",
+		Prime:    "2^224 - 2^96 + 1",
+		Prefix:   "p224",
+		FiatType: "[4]uint64",
+		BytesLen: 28,
+	},
+	// The 32-bit pure Go P-256 in crypto/elliptic is still faster than the
+	// autogenerated code here, regrettably.
+	// {
+	//  Element:  "P256Element",
+	//  Prime:    "2^256 - 2^224 + 2^192 + 2^96 - 1",
+	//  Prefix:   "p256",
+	//  FiatType: "[4]uint64",
+	//  BytesLen: 32,
+	// },
+	{
+		Element:  "P384Element",
+		Prime:    "2^384 - 2^128 - 2^96 + 2^32 - 1",
+		Prefix:   "p384",
+		FiatType: "[6]uint64",
+		BytesLen: 48,
+	},
+	// Note that unsaturated_solinas would be about 2x faster than
+	// word_by_word_montgomery for P-521, but this curve is used rarely enough
+	// that it's not worth carrying unsaturated_solinas support for it.
+	{
+		Element:  "P521Element",
+		Prime:    "2^521 - 1",
+		Prefix:   "p521",
+		FiatType: "[9]uint64",
+		BytesLen: 66,
+	},
+}
+
+func main() {
+	t := template.Must(template.New("montgomery").Parse(tmplWrapper))
+
+	tmplAddchainFile, err := os.CreateTemp("", "addchain-template")
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer os.Remove(tmplAddchainFile.Name())
+	if _, err := io.WriteString(tmplAddchainFile, tmplAddchain); err != nil {
+		log.Fatal(err)
+	}
+	if err := tmplAddchainFile.Close(); err != nil {
+		log.Fatal(err)
+	}
+
+	for _, c := range curves {
+		log.Printf("Generating %s.go...", c.Prefix)
+		f, err := os.Create(c.Prefix + ".go")
+		if err != nil {
+			log.Fatal(err)
+		}
+		if err := t.Execute(f, c); err != nil {
+			log.Fatal(err)
+		}
+		if err := f.Close(); err != nil {
+			log.Fatal(err)
+		}
+
+		log.Printf("Generating %s_fiat64.go...", c.Prefix)
+		cmd := exec.Command("docker", "run", "--rm", "--entrypoint", "word_by_word_montgomery",
+			"fiat-crypto:v0.0.9", "--lang", "Go", "--no-wide-int", "--cmovznz-by-mul",
+			"--relax-primitive-carry-to-bitwidth", "32,64", "--internal-static",
+			"--public-function-case", "camelCase", "--public-type-case", "camelCase",
+			"--private-function-case", "camelCase", "--private-type-case", "camelCase",
+			"--doc-text-before-function-name", "", "--doc-newline-before-package-declaration",
+			"--doc-prepend-header", "Code generated by Fiat Cryptography. DO NOT EDIT.",
+			"--package-name", "fiat", "--no-prefix-fiat", c.Prefix, "64", c.Prime,
+			"mul", "square", "add", "sub", "one", "from_montgomery", "to_montgomery",
+			"selectznz", "to_bytes", "from_bytes")
+		cmd.Stderr = os.Stderr
+		out, err := cmd.Output()
+		if err != nil {
+			log.Fatal(err)
+		}
+		out, err = format.Source(out)
+		if err != nil {
+			log.Fatal(err)
+		}
+		if err := os.WriteFile(c.Prefix+"_fiat64.go", out, 0644); err != nil {
+			log.Fatal(err)
+		}
+
+		log.Printf("Generating %s_invert.go...", c.Prefix)
+		f, err = os.CreateTemp("", "addchain-"+c.Prefix)
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer os.Remove(f.Name())
+		cmd = exec.Command("addchain", "search", c.Prime+" - 2")
+		cmd.Stderr = os.Stderr
+		cmd.Stdout = f
+		if err := cmd.Run(); err != nil {
+			log.Fatal(err)
+		}
+		if err := f.Close(); err != nil {
+			log.Fatal(err)
+		}
+		cmd = exec.Command("addchain", "gen", "-tmpl", tmplAddchainFile.Name(), f.Name())
+		cmd.Stderr = os.Stderr
+		out, err = cmd.Output()
+		if err != nil {
+			log.Fatal(err)
+		}
+		out = bytes.Replace(out, []byte("Element"), []byte(c.Element), -1)
+		out, err = format.Source(out)
+		if err != nil {
+			log.Fatal(err)
+		}
+		if err := os.WriteFile(c.Prefix+"_invert.go", out, 0644); err != nil {
+			log.Fatal(err)
+		}
+	}
+}
+
+const tmplWrapper = `// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Code generated by generate.go. DO NOT EDIT.
+
+package fiat
+
+import (
+	"crypto/subtle"
+	"errors"
+)
+
+// {{ .Element }} is an integer modulo {{ .Prime }}.
+//
+// The zero value is a valid zero element.
+type {{ .Element }} struct {
+	// Values are represented internally always in the Montgomery domain, and
+	// converted in Bytes and SetBytes.
+	x {{ .Prefix }}MontgomeryDomainFieldElement
+}
+
+const {{ .Prefix }}ElementLen = {{ .BytesLen }}
+
+type {{ .Prefix }}UntypedFieldElement = {{ .FiatType }}
+
+// One sets e = 1, and returns e.
+func (e *{{ .Element }}) One() *{{ .Element }} {
+	{{ .Prefix }}SetOne(&e.x)
+	return e
+}
+
+// Equal returns 1 if e == t, and zero otherwise.
+func (e *{{ .Element }}) Equal(t *{{ .Element }}) int {
+	eBytes := e.Bytes()
+	tBytes := t.Bytes()
+	return subtle.ConstantTimeCompare(eBytes, tBytes)
+}
+
+var {{ .Prefix }}ZeroEncoding = new({{ .Element }}).Bytes()
+
+// IsZero returns 1 if e == 0, and zero otherwise.
+func (e *{{ .Element }}) IsZero() int {
+	eBytes := e.Bytes()
+	return subtle.ConstantTimeCompare(eBytes, {{ .Prefix }}ZeroEncoding)
+}
+
+// Set sets e = t, and returns e.
+func (e *{{ .Element }}) Set(t *{{ .Element }}) *{{ .Element }} {
+	e.x = t.x
+	return e
+}
+
+// Bytes returns the {{ .BytesLen }}-byte big-endian encoding of e.
+func (e *{{ .Element }}) Bytes() []byte {
+	// This function is outlined to make the allocations inline in the caller
+	// rather than happen on the heap.
+	var out [{{ .Prefix }}ElementLen]byte
+	return e.bytes(&out)
+}
+
+func (e *{{ .Element }}) bytes(out *[{{ .Prefix }}ElementLen]byte) []byte {
+	var tmp {{ .Prefix }}NonMontgomeryDomainFieldElement
+	{{ .Prefix }}FromMontgomery(&tmp, &e.x)
+	{{ .Prefix }}ToBytes(out, (*{{ .Prefix }}UntypedFieldElement)(&tmp))
+	{{ .Prefix }}InvertEndianness(out[:])
+	return out[:]
+}
+
+// {{ .Prefix }}MinusOneEncoding is the encoding of -1 mod p, so p - 1, the
+// highest canonical encoding. It is used by SetBytes to check for non-canonical
+// encodings such as p + k, 2p + k, etc.
+var {{ .Prefix }}MinusOneEncoding = new({{ .Element }}).Sub(
+	new({{ .Element }}), new({{ .Element }}).One()).Bytes()
+
+// SetBytes sets e = v, where v is a big-endian {{ .BytesLen }}-byte encoding, and returns e.
+// If v is not {{ .BytesLen }} bytes or it encodes a value greater than or equal to {{ .Prime }},
+// SetBytes returns nil and an error, and e is unchanged.
+func (e *{{ .Element }}) SetBytes(v []byte) (*{{ .Element }}, error) {
+	if len(v) != {{ .Prefix }}ElementLen {
+		return nil, errors.New("invalid {{ .Element }} encoding")
+	}
+	for i := range v {
+		if v[i] < {{ .Prefix }}MinusOneEncoding[i] {
+			break
+		}
+		if v[i] > {{ .Prefix }}MinusOneEncoding[i] {
+			return nil, errors.New("invalid {{ .Element }} encoding")
+		}
+	}
+	var in [{{ .Prefix }}ElementLen]byte
+	copy(in[:], v)
+	{{ .Prefix }}InvertEndianness(in[:])
+	var tmp {{ .Prefix }}NonMontgomeryDomainFieldElement
+	{{ .Prefix }}FromBytes((*{{ .Prefix }}UntypedFieldElement)(&tmp), &in)
+	{{ .Prefix }}ToMontgomery(&e.x, &tmp)
+	return e, nil
+}
+
+// Add sets e = t1 + t2, and returns e.
+func (e *{{ .Element }}) Add(t1, t2 *{{ .Element }}) *{{ .Element }} {
+	{{ .Prefix }}Add(&e.x, &t1.x, &t2.x)
+	return e
+}
+
+// Sub sets e = t1 - t2, and returns e.
+func (e *{{ .Element }}) Sub(t1, t2 *{{ .Element }}) *{{ .Element }} {
+	{{ .Prefix }}Sub(&e.x, &t1.x, &t2.x)
+	return e
+}
+
+// Mul sets e = t1 * t2, and returns e.
+func (e *{{ .Element }}) Mul(t1, t2 *{{ .Element }}) *{{ .Element }} {
+	{{ .Prefix }}Mul(&e.x, &t1.x, &t2.x)
+	return e
+}
+
+// Square sets e = t * t, and returns e.
+func (e *{{ .Element }}) Square(t *{{ .Element }}) *{{ .Element }} {
+	{{ .Prefix }}Square(&e.x, &t.x)
+	return e
+}
+
+// Select sets v to a if cond == 1, and to b if cond == 0.
+func (v *{{ .Element }}) Select(a, b *{{ .Element }}, cond int) *{{ .Element }} {
+	{{ .Prefix }}Selectznz((*{{ .Prefix }}UntypedFieldElement)(&v.x), {{ .Prefix }}Uint1(cond),
+		(*{{ .Prefix }}UntypedFieldElement)(&b.x), (*{{ .Prefix }}UntypedFieldElement)(&a.x))
+	return v
+}
+
+func {{ .Prefix }}InvertEndianness(v []byte) {
+	for i := 0; i < len(v)/2; i++ {
+		v[i], v[len(v)-1-i] = v[len(v)-1-i], v[i]
+	}
+}
+`
+
+const tmplAddchain = `// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Code generated by {{ .Meta.Name }}. DO NOT EDIT.
+
+package fiat
+
+// Invert sets e = 1/x, and returns e.
+//
+// If x == 0, Invert returns e = 0.
+func (e *Element) Invert(x *Element) *Element {
+	// Inversion is implemented as exponentiation with exponent p − 2.
+	// The sequence of {{ .Ops.Adds }} multiplications and {{ .Ops.Doubles }} squarings is derived from the
+	// following addition chain generated with {{ .Meta.Module }} {{ .Meta.ReleaseTag }}.
+	//
+	{{- range lines (format .Script) }}
+	//	{{ . }}
+	{{- end }}
+	//
+
+	var z = new(Element).Set(e)
+	{{- range .Program.Temporaries }}
+	var {{ . }} = new(Element)
+	{{- end }}
+	{{ range $i := .Program.Instructions -}}
+	{{- with add $i.Op }}
+	{{ $i.Output }}.Mul({{ .X }}, {{ .Y }})
+	{{- end -}}
+
+	{{- with double $i.Op }}
+	{{ $i.Output }}.Square({{ .X }})
+	{{- end -}}
+
+	{{- with shift $i.Op -}}
+	{{- $first := 0 -}}
+	{{- if ne $i.Output.Identifier .X.Identifier }}
+	{{ $i.Output }}.Square({{ .X }})
+	{{- $first = 1 -}}
+	{{- end }}
+	for s := {{ $first }}; s < {{ .S }}; s++ {
+		{{ $i.Output }}.Square({{ $i.Output }})
+	}
+	{{- end -}}
+	{{- end }}
+
+	return e.Set(z)
+}
+`
--- a/src/crypto/elliptic/internal/fiat/p224.go
+++ b/src/crypto/elliptic/internal/fiat/p224.go
@ -0,0 +1,135 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Code generated by generate.go. DO NOT EDIT.
+
+package fiat
+
+import (
+	"crypto/subtle"
+	"errors"
+)
+
+// P224Element is an integer modulo 2^224 - 2^96 + 1.
+//
+// The zero value is a valid zero element.
+type P224Element struct {
+	// Values are represented internally always in the Montgomery domain, and
+	// converted in Bytes and SetBytes.
+	x p224MontgomeryDomainFieldElement
+}
+
+const p224ElementLen = 28
+
+type p224UntypedFieldElement = [4]uint64
+
+// One sets e = 1, and returns e.
+func (e *P224Element) One() *P224Element {
+	p224SetOne(&e.x)
+	return e
+}
+
+// Equal returns 1 if e == t, and zero otherwise.
+func (e *P224Element) Equal(t *P224Element) int {
+	eBytes := e.Bytes()
+	tBytes := t.Bytes()
+	return subtle.ConstantTimeCompare(eBytes, tBytes)
+}
+
+var p224ZeroEncoding = new(P224Element).Bytes()
+
+// IsZero returns 1 if e == 0, and zero otherwise.
+func (e *P224Element) IsZero() int {
+	eBytes := e.Bytes()
+	return subtle.ConstantTimeCompare(eBytes, p224ZeroEncoding)
+}
+
+// Set sets e = t, and returns e.
+func (e *P224Element) Set(t *P224Element) *P224Element {
+	e.x = t.x
+	return e
+}
+
+// Bytes returns the 28-byte big-endian encoding of e.
+func (e *P224Element) Bytes() []byte {
+	// This function is outlined to make the allocations inline in the caller
+	// rather than happen on the heap.
+	var out [p224ElementLen]byte
+	return e.bytes(&out)
+}
+
+func (e *P224Element) bytes(out *[p224ElementLen]byte) []byte {
+	var tmp p224NonMontgomeryDomainFieldElement
+	p224FromMontgomery(&tmp, &e.x)
+	p224ToBytes(out, (*p224UntypedFieldElement)(&tmp))
+	p224InvertEndianness(out[:])
+	return out[:]
+}
+
+// p224MinusOneEncoding is the encoding of -1 mod p, so p - 1, the
+// highest canonical encoding. It is used by SetBytes to check for non-canonical
+// encodings such as p + k, 2p + k, etc.
+var p224MinusOneEncoding = new(P224Element).Sub(
+	new(P224Element), new(P224Element).One()).Bytes()
+
+// SetBytes sets e = v, where v is a big-endian 28-byte encoding, and returns e.
+// If v is not 28 bytes or it encodes a value greater than or equal to 2^224 - 2^96 + 1,
+// SetBytes returns nil and an error, and e is unchanged.
+func (e *P224Element) SetBytes(v []byte) (*P224Element, error) {
+	if len(v) != p224ElementLen {
+		return nil, errors.New("invalid P224Element encoding")
+	}
+	for i := range v {
+		if v[i] < p224MinusOneEncoding[i] {
+			break
+		}
+		if v[i] > p224MinusOneEncoding[i] {
+			return nil, errors.New("invalid P224Element encoding")
+		}
+	}
+	var in [p224ElementLen]byte
+	copy(in[:], v)
+	p224InvertEndianness(in[:])
+	var tmp p224NonMontgomeryDomainFieldElement
+	p224FromBytes((*p224UntypedFieldElement)(&tmp), &in)
+	p224ToMontgomery(&e.x, &tmp)
+	return e, nil
+}
+
+// Add sets e = t1 + t2, and returns e.
+func (e *P224Element) Add(t1, t2 *P224Element) *P224Element {
+	p224Add(&e.x, &t1.x, &t2.x)
+	return e
+}
+
+// Sub sets e = t1 - t2, and returns e.
+func (e *P224Element) Sub(t1, t2 *P224Element) *P224Element {
+	p224Sub(&e.x, &t1.x, &t2.x)
+	return e
+}
+
+// Mul sets e = t1 * t2, and returns e.
+func (e *P224Element) Mul(t1, t2 *P224Element) *P224Element {
+	p224Mul(&e.x, &t1.x, &t2.x)
+	return e
+}
+
+// Square sets e = t * t, and returns e.
+func (e *P224Element) Square(t *P224Element) *P224Element {
+	p224Square(&e.x, &t.x)
+	return e
+}
+
+// Select sets v to a if cond == 1, and to b if cond == 0.
+func (v *P224Element) Select(a, b *P224Element, cond int) *P224Element {
+	p224Selectznz((*p224UntypedFieldElement)(&v.x), p224Uint1(cond),
+		(*p224UntypedFieldElement)(&b.x), (*p224UntypedFieldElement)(&a.x))
+	return v
+}
+
+func p224InvertEndianness(v []byte) {
+	for i := 0; i < len(v)/2; i++ {
+		v[i], v[len(v)-1-i] = v[len(v)-1-i], v[i]
+	}
+}
--- a/src/crypto/elliptic/internal/fiat/p224_fiat64.go
+++ b/src/crypto/elliptic/internal/fiat/p224_fiat64.go
--- a/src/crypto/elliptic/internal/fiat/p224_invert.go
+++ b/src/crypto/elliptic/internal/fiat/p224_invert.go
@ -0,0 +1,87 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Code generated by addchain. DO NOT EDIT.
+
+package fiat
+
+// Invert sets e = 1/x, and returns e.
+//
+// If x == 0, Invert returns e = 0.
+func (e *P224Element) Invert(x *P224Element) *P224Element {
+	// Inversion is implemented as exponentiation with exponent p − 2.
+	// The sequence of 11 multiplications and 223 squarings is derived from the
+	// following addition chain generated with github.com/mmcloughlin/addchain v0.3.0.
+	//
+	//	_10     = 2*1
+	//	_11     = 1 + _10
+	//	_110    = 2*_11
+	//	_111    = 1 + _110
+	//	_111000 = _111 << 3
+	//	_111111 = _111 + _111000
+	//	x12     = _111111 << 6 + _111111
+	//	x14     = x12 << 2 + _11
+	//	x17     = x14 << 3 + _111
+	//	x31     = x17 << 14 + x14
+	//	x48     = x31 << 17 + x17
+	//	x96     = x48 << 48 + x48
+	//	x127    = x96 << 31 + x31
+	//	return    x127 << 97 + x96
+	//
+
+	var z = new(P224Element).Set(e)
+	var t0 = new(P224Element)
+	var t1 = new(P224Element)
+	var t2 = new(P224Element)
+
+	z.Square(x)
+	t0.Mul(x, z)
+	z.Square(t0)
+	z.Mul(x, z)
+	t1.Square(z)
+	for s := 1; s < 3; s++ {
+		t1.Square(t1)
+	}
+	t1.Mul(z, t1)
+	t2.Square(t1)
+	for s := 1; s < 6; s++ {
+		t2.Square(t2)
+	}
+	t1.Mul(t1, t2)
+	for s := 0; s < 2; s++ {
+		t1.Square(t1)
+	}
+	t0.Mul(t0, t1)
+	t1.Square(t0)
+	for s := 1; s < 3; s++ {
+		t1.Square(t1)
+	}
+	z.Mul(z, t1)
+	t1.Square(z)
+	for s := 1; s < 14; s++ {
+		t1.Square(t1)
+	}
+	t0.Mul(t0, t1)
+	t1.Square(t0)
+	for s := 1; s < 17; s++ {
+		t1.Square(t1)
+	}
+	z.Mul(z, t1)
+	t1.Square(z)
+	for s := 1; s < 48; s++ {
+		t1.Square(t1)
+	}
+	z.Mul(z, t1)
+	t1.Square(z)
+	for s := 1; s < 31; s++ {
+		t1.Square(t1)
+	}
+	t0.Mul(t0, t1)
+	for s := 0; s < 97; s++ {
+		t0.Square(t0)
+	}
+	z.Mul(z, t0)
+
+	return e.Set(z)
+}
--- a/src/crypto/elliptic/internal/fiat/p384.go
+++ b/src/crypto/elliptic/internal/fiat/p384.go
@ -0,0 +1,135 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Code generated by generate.go. DO NOT EDIT.
+
+package fiat
+
+import (
+	"crypto/subtle"
+	"errors"
+)
+
+// P384Element is an integer modulo 2^384 - 2^128 - 2^96 + 2^32 - 1.
+//
+// The zero value is a valid zero element.
+type P384Element struct {
+	// Values are represented internally always in the Montgomery domain, and
+	// converted in Bytes and SetBytes.
+	x p384MontgomeryDomainFieldElement
+}
+
+const p384ElementLen = 48
+
+type p384UntypedFieldElement = [6]uint64
+
+// One sets e = 1, and returns e.
+func (e *P384Element) One() *P384Element {
+	p384SetOne(&e.x)
+	return e
+}
+
+// Equal returns 1 if e == t, and zero otherwise.
+func (e *P384Element) Equal(t *P384Element) int {
+	eBytes := e.Bytes()
+	tBytes := t.Bytes()
+	return subtle.ConstantTimeCompare(eBytes, tBytes)
+}
+
+var p384ZeroEncoding = new(P384Element).Bytes()
+
+// IsZero returns 1 if e == 0, and zero otherwise.
+func (e *P384Element) IsZero() int {
+	eBytes := e.Bytes()
+	return subtle.ConstantTimeCompare(eBytes, p384ZeroEncoding)
+}
+
+// Set sets e = t, and returns e.
+func (e *P384Element) Set(t *P384Element) *P384Element {
+	e.x = t.x
+	return e
+}
+
+// Bytes returns the 48-byte big-endian encoding of e.
+func (e *P384Element) Bytes() []byte {
+	// This function is outlined to make the allocations inline in the caller
+	// rather than happen on the heap.
+	var out [p384ElementLen]byte
+	return e.bytes(&out)
+}
+
+func (e *P384Element) bytes(out *[p384ElementLen]byte) []byte {
+	var tmp p384NonMontgomeryDomainFieldElement
+	p384FromMontgomery(&tmp, &e.x)
+	p384ToBytes(out, (*p384UntypedFieldElement)(&tmp))
+	p384InvertEndianness(out[:])
+	return out[:]
+}
+
+// p384MinusOneEncoding is the encoding of -1 mod p, so p - 1, the
+// highest canonical encoding. It is used by SetBytes to check for non-canonical
+// encodings such as p + k, 2p + k, etc.
+var p384MinusOneEncoding = new(P384Element).Sub(
+	new(P384Element), new(P384Element).One()).Bytes()
+
+// SetBytes sets e = v, where v is a big-endian 48-byte encoding, and returns e.
+// If v is not 48 bytes or it encodes a value greater than or equal to 2^384 - 2^128 - 2^96 + 2^32 - 1,
+// SetBytes returns nil and an error, and e is unchanged.
+func (e *P384Element) SetBytes(v []byte) (*P384Element, error) {
+	if len(v) != p384ElementLen {
+		return nil, errors.New("invalid P384Element encoding")
+	}
+	for i := range v {
+		if v[i] < p384MinusOneEncoding[i] {
+			break
+		}
+		if v[i] > p384MinusOneEncoding[i] {
+			return nil, errors.New("invalid P384Element encoding")
+		}
+	}
+	var in [p384ElementLen]byte
+	copy(in[:], v)
+	p384InvertEndianness(in[:])
+	var tmp p384NonMontgomeryDomainFieldElement
+	p384FromBytes((*p384UntypedFieldElement)(&tmp), &in)
+	p384ToMontgomery(&e.x, &tmp)
+	return e, nil
+}
+
+// Add sets e = t1 + t2, and returns e.
+func (e *P384Element) Add(t1, t2 *P384Element) *P384Element {
+	p384Add(&e.x, &t1.x, &t2.x)
+	return e
+}
+
+// Sub sets e = t1 - t2, and returns e.
+func (e *P384Element) Sub(t1, t2 *P384Element) *P384Element {
+	p384Sub(&e.x, &t1.x, &t2.x)
+	return e
+}
+
+// Mul sets e = t1 * t2, and returns e.
+func (e *P384Element) Mul(t1, t2 *P384Element) *P384Element {
+	p384Mul(&e.x, &t1.x, &t2.x)
+	return e
+}
+
+// Square sets e = t * t, and returns e.
+func (e *P384Element) Square(t *P384Element) *P384Element {
+	p384Square(&e.x, &t.x)
+	return e
+}
+
+// Select sets v to a if cond == 1, and to b if cond == 0.
+func (v *P384Element) Select(a, b *P384Element, cond int) *P384Element {
+	p384Selectznz((*p384UntypedFieldElement)(&v.x), p384Uint1(cond),
+		(*p384UntypedFieldElement)(&b.x), (*p384UntypedFieldElement)(&a.x))
+	return v
+}
+
+func p384InvertEndianness(v []byte) {
+	for i := 0; i < len(v)/2; i++ {
+		v[i], v[len(v)-1-i] = v[len(v)-1-i], v[i]
+	}
+}
--- a/src/crypto/elliptic/internal/fiat/p384_fiat64.go
+++ b/src/crypto/elliptic/internal/fiat/p384_fiat64.go
--- a/src/crypto/elliptic/internal/fiat/p384_invert.go
+++ b/src/crypto/elliptic/internal/fiat/p384_invert.go
@ -0,0 +1,102 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Code generated by addchain. DO NOT EDIT.
+
+package fiat
+
+// Invert sets e = 1/x, and returns e.
+//
+// If x == 0, Invert returns e = 0.
+func (e *P384Element) Invert(x *P384Element) *P384Element {
+	// Inversion is implemented as exponentiation with exponent p − 2.
+	// The sequence of 15 multiplications and 383 squarings is derived from the
+	// following addition chain generated with github.com/mmcloughlin/addchain v0.3.0.
+	//
+	//	_10     = 2*1
+	//	_11     = 1 + _10
+	//	_110    = 2*_11
+	//	_111    = 1 + _110
+	//	_111000 = _111 << 3
+	//	_111111 = _111 + _111000
+	//	x12     = _111111 << 6 + _111111
+	//	x24     = x12 << 12 + x12
+	//	x30     = x24 << 6 + _111111
+	//	x31     = 2*x30 + 1
+	//	x32     = 2*x31 + 1
+	//	x63     = x32 << 31 + x31
+	//	x126    = x63 << 63 + x63
+	//	x252    = x126 << 126 + x126
+	//	x255    = x252 << 3 + _111
+	//	i397    = ((x255 << 33 + x32) << 94 + x30) << 2
+	//	return    1 + i397
+	//
+
+	var z = new(P384Element).Set(e) // scratch accumulator; this copy of e is overwritten by the first Square below
+	var t0 = new(P384Element)
+	var t1 = new(P384Element)
+	var t2 = new(P384Element)
+	var t3 = new(P384Element)
+
+	z.Square(x) // z = _10
+	z.Mul(x, z) // z = _11
+	z.Square(z) // z = _110
+	t1.Mul(x, z) // t1 = _111
+	z.Square(t1)
+	for s := 1; s < 3; s++ { // with the Square above: 3 squarings in total
+		z.Square(z)
+	}
+	z.Mul(t1, z) // z = _111111
+	t0.Square(z)
+	for s := 1; s < 6; s++ {
+		t0.Square(t0)
+	}
+	t0.Mul(z, t0) // t0 = x12
+	t2.Square(t0)
+	for s := 1; s < 12; s++ {
+		t2.Square(t2)
+	}
+	t0.Mul(t0, t2) // t0 = x24
+	for s := 0; s < 6; s++ {
+		t0.Square(t0)
+	}
+	z.Mul(z, t0) // z = x30
+	t0.Square(z)
+	t2.Mul(x, t0) // t2 = x31
+	t0.Square(t2)
+	t0.Mul(x, t0) // t0 = x32
+	t3.Square(t0)
+	for s := 1; s < 31; s++ {
+		t3.Square(t3)
+	}
+	t2.Mul(t2, t3) // t2 = x63
+	t3.Square(t2)
+	for s := 1; s < 63; s++ {
+		t3.Square(t3)
+	}
+	t2.Mul(t2, t3) // t2 = x126
+	t3.Square(t2)
+	for s := 1; s < 126; s++ {
+		t3.Square(t3)
+	}
+	t2.Mul(t2, t3) // t2 = x252
+	for s := 0; s < 3; s++ {
+		t2.Square(t2)
+	}
+	t1.Mul(t1, t2) // t1 = x255
+	for s := 0; s < 33; s++ {
+		t1.Square(t1)
+	}
+	t0.Mul(t0, t1) // t0 = x255 << 33 + x32
+	for s := 0; s < 94; s++ {
+		t0.Square(t0)
+	}
+	z.Mul(z, t0) // z = (x255 << 33 + x32) << 94 + x30
+	for s := 0; s < 2; s++ { // z = i397
+		z.Square(z)
+	}
+	z.Mul(x, z) // z = 1 + i397
+
+	return e.Set(z) // e is written only here, after the last read of x
+}
--- a/src/crypto/elliptic/internal/fiat/p521.go
+++ b/src/crypto/elliptic/internal/fiat/p521.go
@ -2,8 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// Package fiat implements prime order fields using formally verified algorithms
-// from the Fiat Cryptography project.
+// Code generated by generate.go. DO NOT EDIT.
+
 package fiat

 import (
@ -15,20 +15,18 @@ import (
 //
 // The zero value is a valid zero element.
 type P521Element struct {
-	// This element has the following bounds, which are tighter than
-	// the output bounds of some operations. Those operations must be
-	// followed by a carry.
-	//
-	// [0x0 ~> 0x400000000000000], [0x0 ~> 0x400000000000000], [0x0 ~> 0x400000000000000],
-	// [0x0 ~> 0x400000000000000], [0x0 ~> 0x400000000000000], [0x0 ~> 0x400000000000000],
-	// [0x0 ~> 0x400000000000000], [0x0 ~> 0x400000000000000], [0x0 ~> 0x200000000000000]
-	x [9]uint64
+	// Values are represented internally always in the Montgomery domain, and
+	// converted in Bytes and SetBytes.
+	x p521MontgomeryDomainFieldElement
 }

+const p521ElementLen = 66
+
+type p521UntypedFieldElement = [9]uint64
+
 // One sets e = 1, and returns e.
 func (e *P521Element) One() *P521Element {
-	*e = P521Element{}
-	e.x[0] = 1
+	p521SetOne(&e.x)
 	return e
 }

@ -57,153 +55,81 @@ func (e *P521Element) Set(t *P521Element) *P521Element {
 func (e *P521Element) Bytes() []byte {
 	// This function is outlined to make the allocations inline in the caller
 	// rather than happen on the heap.
-	var out [66]byte
+	var out [p521ElementLen]byte
 	return e.bytes(&out)
 }

-func (e *P521Element) bytes(out *[66]byte) []byte {
-	p521ToBytes(out, &e.x)
-	invertEndianness(out[:])
+func (e *P521Element) bytes(out *[p521ElementLen]byte) []byte {
+	var tmp p521NonMontgomeryDomainFieldElement
+	p521FromMontgomery(&tmp, &e.x)
+	p521ToBytes(out, (*p521UntypedFieldElement)(&tmp))
+	p521InvertEndianness(out[:])
 	return out[:]
 }

-// SetBytes sets e = v, where v is a big-endian 66-byte encoding, and returns
-// e. If v is not 66 bytes or it encodes a value higher than 2^521 - 1, SetBytes
-// returns nil and an error, and e is unchanged.
-func (e *P521Element) SetBytes(v []byte) (*P521Element, error) {
-	if len(v) != 66 || v[0] > 1 {
-		return nil, errors.New("invalid P-521 field encoding")
-	}
-	var in [66]byte
-	copy(in[:], v)
-	invertEndianness(in[:])
-	p521FromBytes(&e.x, &in)
-	return e, nil
-}
+// p521MinusOneEncoding is the encoding of -1 mod p, so p - 1, the
+// highest canonical encoding. It is used by SetBytes to check for non-canonical
+// encodings such as p + k, 2p + k, etc.
+var p521MinusOneEncoding = new(P521Element).Sub(
+	new(P521Element), new(P521Element).One()).Bytes()

-func invertEndianness(v []byte) {
-	for i := 0; i < len(v)/2; i++ {
-		v[i], v[len(v)-1-i] = v[len(v)-1-i], v[i]
+// SetBytes sets e = v, where v is a big-endian 66-byte encoding, and returns e.
+// If v is not 66 bytes or it encodes a value higher than 2^521 - 2,
+// SetBytes returns nil and an error, and e is unchanged.
+func (e *P521Element) SetBytes(v []byte) (*P521Element, error) {
+	if len(v) != p521ElementLen {
+		return nil, errors.New("invalid P521Element encoding")
 	}
+	for i := range v {
+		if v[i] < p521MinusOneEncoding[i] {
+			break
+		}
+		if v[i] > p521MinusOneEncoding[i] {
+			return nil, errors.New("invalid P521Element encoding")
+		}
+	}
+	var in [p521ElementLen]byte
+	copy(in[:], v)
+	p521InvertEndianness(in[:])
+	var tmp p521NonMontgomeryDomainFieldElement
+	p521FromBytes((*p521UntypedFieldElement)(&tmp), &in)
+	p521ToMontgomery(&e.x, &tmp)
+	return e, nil
 }

 // Add sets e = t1 + t2, and returns e.
 func (e *P521Element) Add(t1, t2 *P521Element) *P521Element {
 	p521Add(&e.x, &t1.x, &t2.x)
-	p521Carry(&e.x, &e.x)
 	return e
 }

 // Sub sets e = t1 - t2, and returns e.
 func (e *P521Element) Sub(t1, t2 *P521Element) *P521Element {
 	p521Sub(&e.x, &t1.x, &t2.x)
-	p521Carry(&e.x, &e.x)
 	return e
 }

 // Mul sets e = t1 * t2, and returns e.
 func (e *P521Element) Mul(t1, t2 *P521Element) *P521Element {
-	p521CarryMul(&e.x, &t1.x, &t2.x)
+	p521Mul(&e.x, &t1.x, &t2.x)
 	return e
 }

 // Square sets e = t * t, and returns e.
 func (e *P521Element) Square(t *P521Element) *P521Element {
-	p521CarrySquare(&e.x, &t.x)
+	p521Square(&e.x, &t.x)
 	return e
 }

-// Select sets e to a if cond == 1, and to b if cond == 0.
+// Select sets v to a if cond == 1, and to b if cond == 0.
 func (v *P521Element) Select(a, b *P521Element, cond int) *P521Element {
-	p521Selectznz(&v.x, p521Uint1(cond), &b.x, &a.x)
+	p521Selectznz((*p521UntypedFieldElement)(&v.x), p521Uint1(cond),
+		(*p521UntypedFieldElement)(&b.x), (*p521UntypedFieldElement)(&a.x))
 	return v
 }

-// Invert sets e = 1/t, and returns e.
-//
-// If t == 0, Invert returns e = 0.
-func (e *P521Element) Invert(t *P521Element) *P521Element {
-	// Inversion is implemented as exponentiation with exponent p − 2.
-	// The sequence of multiplications and squarings was generated with
-	// github.com/mmcloughlin/addchain v0.2.0.
-
-	var t1, t2 = new(P521Element), new(P521Element)
-
-	// _10 = 2 * 1
-	t1.Square(t)
-
-	// _11 = 1 + _10
-	t1.Mul(t, t1)
-
-	// _1100 = _11 << 2
-	t2.Square(t1)
-	t2.Square(t2)
-
-	// _1111 = _11 + _1100
-	t1.Mul(t1, t2)
-
-	// _11110000 = _1111 << 4
-	t2.Square(t1)
-	for i := 0; i < 3; i++ {
-		t2.Square(t2)
+func p521InvertEndianness(v []byte) {
+	for i := 0; i < len(v)/2; i++ {
+		v[i], v[len(v)-1-i] = v[len(v)-1-i], v[i]
 	}
-
-	// _11111111 = _1111 + _11110000
-	t1.Mul(t1, t2)
-
-	// x16 = _11111111<<8 + _11111111
-	t2.Square(t1)
-	for i := 0; i < 7; i++ {
-		t2.Square(t2)
-	}
-	t1.Mul(t1, t2)
-
-	// x32 = x16<<16 + x16
-	t2.Square(t1)
-	for i := 0; i < 15; i++ {
-		t2.Square(t2)
-	}
-	t1.Mul(t1, t2)
-
-	// x64 = x32<<32 + x32
-	t2.Square(t1)
-	for i := 0; i < 31; i++ {
-		t2.Square(t2)
-	}
-	t1.Mul(t1, t2)
-
-	// x65 = 2*x64 + 1
-	t2.Square(t1)
-	t2.Mul(t2, t)
-
-	// x129 = x65<<64 + x64
-	for i := 0; i < 64; i++ {
-		t2.Square(t2)
-	}
-	t1.Mul(t1, t2)
-
-	// x130 = 2*x129 + 1
-	t2.Square(t1)
-	t2.Mul(t2, t)
-
-	// x259 = x130<<129 + x129
-	for i := 0; i < 129; i++ {
-		t2.Square(t2)
-	}
-	t1.Mul(t1, t2)
-
-	// x260 = 2*x259 + 1
-	t2.Square(t1)
-	t2.Mul(t2, t)
-
-	// x519 = x260<<259 + x259
-	for i := 0; i < 259; i++ {
-		t2.Square(t2)
-	}
-	t1.Mul(t1, t2)
-
-	// return x519<<2 + 1
-	t1.Square(t1)
-	t1.Square(t1)
-	return e.Mul(t1, t)
 }
--- a/src/crypto/elliptic/internal/fiat/p521_fiat64.go
+++ b/src/crypto/elliptic/internal/fiat/p521_fiat64.go
--- a/src/crypto/elliptic/internal/fiat/p521_invert.go
+++ b/src/crypto/elliptic/internal/fiat/p521_invert.go
@ -0,0 +1,89 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Code generated by addchain. DO NOT EDIT.
+
+package fiat
+
+// Invert sets e = 1/x, and returns e.
+//
+// If x == 0, Invert returns e = 0.
+func (e *P521Element) Invert(x *P521Element) *P521Element {
+	// Inversion is implemented as exponentiation with exponent p − 2.
+	// The sequence of 13 multiplications and 520 squarings is derived from the
+	// following addition chain generated with github.com/mmcloughlin/addchain v0.3.0.
+	//
+	//	_10       = 2*1
+	//	_11       = 1 + _10
+	//	_1100     = _11 << 2
+	//	_1111     = _11 + _1100
+	//	_11110000 = _1111 << 4
+	//	_11111111 = _1111 + _11110000
+	//	x16       = _11111111 << 8 + _11111111
+	//	x32       = x16 << 16 + x16
+	//	x64       = x32 << 32 + x32
+	//	x65       = 2*x64 + 1
+	//	x129      = x65 << 64 + x64
+	//	x130      = 2*x129 + 1
+	//	x259      = x130 << 129 + x129
+	//	x260      = 2*x259 + 1
+	//	x519      = x260 << 259 + x259
+	//	return      x519 << 2 + 1
+	//
+
+	var z = new(P521Element).Set(e) // scratch accumulator; this copy of e is overwritten by the first Square below
+	var t0 = new(P521Element)
+
+	z.Square(x) // z = _10
+	z.Mul(x, z) // z = _11
+	t0.Square(z)
+	for s := 1; s < 2; s++ { // with the Square above: 2 squarings in total
+		t0.Square(t0)
+	}
+	z.Mul(z, t0) // z = _1111
+	t0.Square(z)
+	for s := 1; s < 4; s++ {
+		t0.Square(t0)
+	}
+	z.Mul(z, t0) // z = _11111111
+	t0.Square(z)
+	for s := 1; s < 8; s++ {
+		t0.Square(t0)
+	}
+	z.Mul(z, t0) // z = x16
+	t0.Square(z)
+	for s := 1; s < 16; s++ {
+		t0.Square(t0)
+	}
+	z.Mul(z, t0) // z = x32
+	t0.Square(z)
+	for s := 1; s < 32; s++ {
+		t0.Square(t0)
+	}
+	z.Mul(z, t0) // z = x64
+	t0.Square(z)
+	t0.Mul(x, t0) // t0 = x65
+	for s := 0; s < 64; s++ {
+		t0.Square(t0)
+	}
+	z.Mul(z, t0) // z = x129
+	t0.Square(z)
+	t0.Mul(x, t0) // t0 = x130
+	for s := 0; s < 129; s++ {
+		t0.Square(t0)
+	}
+	z.Mul(z, t0) // z = x259
+	t0.Square(z)
+	t0.Mul(x, t0) // t0 = x260
+	for s := 0; s < 259; s++ {
+		t0.Square(t0)
+	}
+	z.Mul(z, t0) // z = x519
+	for s := 0; s < 2; s++ {
+		z.Square(z)
+	}
+	z.Mul(x, z) // z = x519 << 2 + 1
+
+	return e.Set(z) // e is written only here, after the last read of x
+}
--- a/src/crypto/elliptic/internal/fiat/p521_test.go
+++ b/src/crypto/elliptic/internal/fiat/p521_test.go
@ -1,37 +0,0 @@
-// Copyright 2021 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package fiat_test
-
-import (
-	"crypto/elliptic/internal/fiat"
-	"crypto/rand"
-	"testing"
-)
-
-func p521Random(t *testing.T) *fiat.P521Element {
-	buf := make([]byte, 66)
-	if _, err := rand.Read(buf); err != nil {
-		t.Fatal(err)
-	}
-	buf[0] &= 1
-	e, err := new(fiat.P521Element).SetBytes(buf)
-	if err != nil {
-		t.Fatal(err)
-	}
-	return e
-}
-
-func TestP521Invert(t *testing.T) {
-	a := p521Random(t)
-	inv := new(fiat.P521Element).Invert(a)
-	one := new(fiat.P521Element).Mul(a, inv)
-	if new(fiat.P521Element).One().Equal(one) != 1 {
-		t.Errorf("a * 1/a != 1; got %x for %x", one.Bytes(), a.Bytes())
-	}
-	inv.Invert(new(fiat.P521Element))
-	if new(fiat.P521Element).Equal(inv) != 1 {
-		t.Errorf("1/0 != 0; got %x", inv.Bytes())
-	}
-}
--- a/src/crypto/elliptic/internal/nistec/nistec_test.go
+++ b/src/crypto/elliptic/internal/nistec/nistec_test.go
@ -0,0 +1,94 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package nistec_test
+
+import (
+	"crypto/elliptic/internal/nistec"
+	"math/rand"
+	"os"
+	"strings"
+	"testing"
+)
+
+func TestAllocations(t *testing.T) { // fails if a ScalarMult + Bytes + SetBytes round trip performs any heap allocation
+	if strings.HasSuffix(os.Getenv("GO_BUILDER_NAME"), "-noopt") {
+		t.Skip("skipping allocations test without relevant optimizations")
+	}
+	t.Run("P224", func(t *testing.T) {
+		if allocs := testing.AllocsPerRun(100, func() {
+			p := nistec.NewP224Generator()
+			scalar := make([]byte, 66) // same 66-byte scalar length is used for every curve in this test
+			rand.Read(scalar) // math/rand: the scalar only needs to vary, not to be secret
+			p.ScalarMult(p, scalar)
+			out := p.Bytes()
+			if _, err := p.SetBytes(out); err != nil {
+				t.Fatal(err)
+			}
+		}); allocs > 0 {
+			t.Errorf("expected zero allocations, got %0.1f", allocs)
+		}
+	})
+	t.Run("P384", func(t *testing.T) {
+		if allocs := testing.AllocsPerRun(100, func() {
+			p := nistec.NewP384Generator()
+			scalar := make([]byte, 66)
+			rand.Read(scalar)
+			p.ScalarMult(p, scalar)
+			out := p.Bytes()
+			if _, err := p.SetBytes(out); err != nil {
+				t.Fatal(err)
+			}
+		}); allocs > 0 {
+			t.Errorf("expected zero allocations, got %0.1f", allocs)
+		}
+	})
+	t.Run("P521", func(t *testing.T) {
+		if allocs := testing.AllocsPerRun(100, func() {
+			p := nistec.NewP521Generator()
+			scalar := make([]byte, 66)
+			rand.Read(scalar)
+			p.ScalarMult(p, scalar)
+			out := p.Bytes()
+			if _, err := p.SetBytes(out); err != nil {
+				t.Fatal(err)
+			}
+		}); allocs > 0 {
+			t.Errorf("expected zero allocations, got %0.1f", allocs)
+		}
+	})
+}
+
+func BenchmarkScalarMult(b *testing.B) { // times repeated in-place ScalarMult on each curve with one random 66-byte scalar
+	b.Run("P224", func(sub *testing.B) {
+		k := make([]byte, 66)
+		rand.Read(k)
+		pt := nistec.NewP224Generator()
+		sub.ReportAllocs()
+		sub.ResetTimer()
+		for n := sub.N; n > 0; n-- {
+			pt.ScalarMult(pt, k)
+		}
+	})
+	b.Run("P384", func(sub *testing.B) {
+		k := make([]byte, 66)
+		rand.Read(k)
+		pt := nistec.NewP384Generator()
+		sub.ReportAllocs()
+		sub.ResetTimer()
+		for n := sub.N; n > 0; n-- {
+			pt.ScalarMult(pt, k)
+		}
+	})
+	b.Run("P521", func(sub *testing.B) {
+		k := make([]byte, 66)
+		rand.Read(k)
+		pt := nistec.NewP521Generator()
+		sub.ReportAllocs()
+		sub.ResetTimer()
+		for n := sub.N; n > 0; n-- {
+			pt.ScalarMult(pt, k)
+		}
+	})
+}
--- a/src/crypto/elliptic/internal/nistec/p224.go
+++ b/src/crypto/elliptic/internal/nistec/p224.go
@ -0,0 +1,293 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package nistec
+
+import (
+	"crypto/elliptic/internal/fiat"
+	"crypto/subtle"
+	"errors"
+)
+
+var p224B, _ = new(fiat.P224Element).SetBytes([]byte{0xb4, 0x05, 0x0a, 0x85, // b in y² = x³ - 3x + b; error discarded because the input is this fixed table
+	0x0c, 0x04, 0xb3, 0xab, 0xf5, 0x41, 0x32, 0x56, 0x50, 0x44, 0xb0, 0xb7,
+	0xd7, 0xbf, 0xd8, 0xba, 0x27, 0x0b, 0x39, 0x43, 0x23, 0x55, 0xff, 0xb4})
+
+var p224G, _ = NewP224Point().SetBytes([]byte{0x04, // generator, uncompressed (0x04 prefix, then x, then y); error discarded for this fixed table
+	0xb7, 0x0e, 0x0c, 0xbd, 0x6b, 0xb4, 0xbf, 0x7f, 0x32, 0x13, 0x90, 0xb9,
+	0x4a, 0x03, 0xc1, 0xd3, 0x56, 0xc2, 0x11, 0x22, 0x34, 0x32, 0x80, 0xd6,
+	0x11, 0x5c, 0x1d, 0x21, 0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb,
+	0x4c, 0x22, 0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64,
+	0x44, 0xd5, 0x81, 0x99, 0x85, 0x0, 0x7e, 0x34})
+
+const p224ElementLength = 28 // length in bytes of one encoded coordinate, as sliced in SetBytes
+
+// P224Point is a P-224 point. The zero value is NOT valid (its coordinate pointers are nil); use NewP224Point or NewP224Generator.
+type P224Point struct {
+	// The point is represented in projective coordinates (X:Y:Z), where x = X/Z
+	// and y = Y/Z. Z = 0 denotes the point at infinity, which NewP224Point stores as (0:1:0).
+	x, y, z *fiat.P224Element
+}
+
+// NewP224Point returns a new P224Point representing the point at infinity.
+func NewP224Point() *P224Point {
+	return &P224Point{
+		x: new(fiat.P224Element), // X = 0
+		y: new(fiat.P224Element).One(),
+		z: new(fiat.P224Element), // Z = 0 marks the point at infinity
+	}
+}
+
+// NewP224Generator returns a new P224Point set to the canonical generator. The result is a copy, so changing it does not affect p224G.
+func NewP224Generator() *P224Point {
+	return (&P224Point{
+		x: new(fiat.P224Element),
+		y: new(fiat.P224Element),
+		z: new(fiat.P224Element),
+	}).Set(p224G) // Set copies the coordinate values into the freshly allocated elements
+}
+
+// Set copies the three projective coordinates of q into p and returns p.
+func (p *P224Point) Set(q *P224Point) *P224Point {
+	dx, dy, dz := p.x, p.y, p.z
+	dx.Set(q.x)
+	dy.Set(q.y)
+	dz.Set(q.z)
+	return p
+}
+
+// SetBytes sets p to the uncompressed or infinity value encoded in b, as
+// specified in SEC 1, Version 2.0, Section 2.3.4, and returns p. Compressed
+// encodings (prefix 2 or 3) are recognized but not yet supported. On any error
+// (bad encoding, point not on the curve) it returns nil and p is unchanged.
+func (p *P224Point) SetBytes(b []byte) (*P224Point, error) {
+	switch {
+	// Point at infinity.
+	case len(b) == 1 && b[0] == 0:
+		return p.Set(NewP224Point()), nil
+
+	// Uncompressed form.
+	case len(b) == 1+2*p224ElementLength && b[0] == 4:
+		x, err := new(fiat.P224Element).SetBytes(b[1 : 1+p224ElementLength])
+		if err != nil {
+			return nil, err
+		}
+		y, err := new(fiat.P224Element).SetBytes(b[1+p224ElementLength:])
+		if err != nil {
+			return nil, err
+		}
+		if err := p224CheckOnCurve(x, y); err != nil {
+			return nil, err
+		}
+		p.x.Set(x) // p is modified only after both coordinates and the curve equation check out
+		p.y.Set(y)
+		p.z.One()
+		return p, nil
+
+	// Compressed form: SEC 1 uses prefix 2 or 3 (not 0) followed by the x coordinate.
+	case len(b) == 1+p224ElementLength && (b[0] == 2 || b[0] == 3):
+		return nil, errors.New("unimplemented") // TODO(filippo)
+
+	default:
+		return nil, errors.New("invalid P224 point encoding")
+	}
+}
+
+// p224CheckOnCurve returns an error unless (x, y) satisfies y² = x³ - 3x + b.
+func p224CheckOnCurve(x, y *fiat.P224Element) error {
+	// Left-hand side: y².
+	lhs := new(fiat.P224Element).Square(y)
+
+	// Right-hand side: x³ - 3x + b, built as (x² * x) - (x + x + x) + b.
+	rhs := new(fiat.P224Element).Square(x)
+	rhs.Mul(rhs, x)
+
+	triple := new(fiat.P224Element).Add(x, x)
+	triple.Add(triple, x)
+	rhs.Sub(rhs, triple)
+	rhs.Add(rhs, p224B)
+
+	if rhs.Equal(lhs) == 1 {
+		return nil
+	}
+	return errors.New("P224 point not on curve")
+}
+
+// Bytes returns the uncompressed or infinity encoding of p, as specified in
+// SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the point at
+// infinity is shorter than all other encodings.
+func (p *P224Point) Bytes() []byte {
+	// This function is outlined to make the allocations inline in the caller
+	// rather than happen on the heap. The buffer holds exactly one uncompressed
+	// encoding: a prefix byte followed by two coordinates.
+	var out [1 + 2*p224ElementLength]byte
+	return p.bytes(&out)
+}
+
+// bytes writes the encoding of p into out and returns the used portion of it.
+func (p *P224Point) bytes(out *[1 + 2*p224ElementLength]byte) []byte {
+	if p.z.IsZero() == 1 {
+		return append(out[:0], 0) // point at infinity: the single byte 0
+	}
+
+	// Convert from projective (X:Y:Z) to affine coordinates x = X/Z, y = Y/Z.
+	zinv := new(fiat.P224Element).Invert(p.z)
+	xx := new(fiat.P224Element).Mul(p.x, zinv)
+	yy := new(fiat.P224Element).Mul(p.y, zinv)
+
+	buf := append(out[:0], 4) // 4 is the uncompressed-form prefix
+	buf = append(buf, xx.Bytes()...)
+	buf = append(buf, yy.Bytes()...)
+	return buf
+}
+
+// Add sets q = p1 + p2, and returns q. The points may overlap: results are staged in temporaries and copied into q at the end.
+func (q *P224Point) Add(p1, p2 *P224Point) *P224Point {
+	// Complete addition formula for a = -3 from "Complete addition formulas for prime order
+	// elliptic curves" (https://eprint.iacr.org/2015/1060), §A.2; trailing comments give each step.
+
+	t0 := new(fiat.P224Element).Mul(p1.x, p2.x) // t0 := X1 * X2
+	t1 := new(fiat.P224Element).Mul(p1.y, p2.y) // t1 := Y1 * Y2
+	t2 := new(fiat.P224Element).Mul(p1.z, p2.z) // t2 := Z1 * Z2
+	t3 := new(fiat.P224Element).Add(p1.x, p1.y) // t3 := X1 + Y1
+	t4 := new(fiat.P224Element).Add(p2.x, p2.y) // t4 := X2 + Y2
+	t3.Mul(t3, t4)                              // t3 := t3 * t4
+	t4.Add(t0, t1)                              // t4 := t0 + t1
+	t3.Sub(t3, t4)                              // t3 := t3 - t4
+	t4.Add(p1.y, p1.z)                          // t4 := Y1 + Z1
+	x3 := new(fiat.P224Element).Add(p2.y, p2.z) // X3 := Y2 + Z2
+	t4.Mul(t4, x3)                              // t4 := t4 * X3
+	x3.Add(t1, t2)                              // X3 := t1 + t2
+	t4.Sub(t4, x3)                              // t4 := t4 - X3
+	x3.Add(p1.x, p1.z)                          // X3 := X1 + Z1
+	y3 := new(fiat.P224Element).Add(p2.x, p2.z) // Y3 := X2 + Z2
+	x3.Mul(x3, y3)                              // X3 := X3 * Y3
+	y3.Add(t0, t2)                              // Y3 := t0 + t2
+	y3.Sub(x3, y3)                              // Y3 := X3 - Y3
+	z3 := new(fiat.P224Element).Mul(p224B, t2)  // Z3 := b * t2
+	x3.Sub(y3, z3)                              // X3 := Y3 - Z3
+	z3.Add(x3, x3)                              // Z3 := X3 + X3
+	x3.Add(x3, z3)                              // X3 := X3 + Z3
+	z3.Sub(t1, x3)                              // Z3 := t1 - X3
+	x3.Add(t1, x3)                              // X3 := t1 + X3
+	y3.Mul(p224B, y3)                           // Y3 := b * Y3
+	t1.Add(t2, t2)                              // t1 := t2 + t2
+	t2.Add(t1, t2)                              // t2 := t1 + t2
+	y3.Sub(y3, t2)                              // Y3 := Y3 - t2
+	y3.Sub(y3, t0)                              // Y3 := Y3 - t0
+	t1.Add(y3, y3)                              // t1 := Y3 + Y3
+	y3.Add(t1, y3)                              // Y3 := t1 + Y3
+	t1.Add(t0, t0)                              // t1 := t0 + t0
+	t0.Add(t1, t0)                              // t0 := t1 + t0
+	t0.Sub(t0, t2)                              // t0 := t0 - t2
+	t1.Mul(t4, y3)                              // t1 := t4 * Y3
+	t2.Mul(t0, y3)                              // t2 := t0 * Y3
+	y3.Mul(x3, z3)                              // Y3 := X3 * Z3
+	y3.Add(y3, t2)                              // Y3 := Y3 + t2
+	x3.Mul(t3, x3)                              // X3 := t3 * X3
+	x3.Sub(x3, t1)                              // X3 := X3 - t1
+	z3.Mul(t4, z3)                              // Z3 := t4 * Z3
+	t1.Mul(t3, t0)                              // t1 := t3 * t0
+	z3.Add(z3, t1)                              // Z3 := Z3 + t1
+
+	q.x.Set(x3) // q is written only after every read of p1 and p2
+	q.y.Set(y3)
+	q.z.Set(z3)
+	return q
+}
+
+// Double sets q = p + p, and returns q. The points may overlap: results are staged in temporaries and copied into q at the end.
+func (q *P224Point) Double(p *P224Point) *P224Point {
+	// Complete addition formula for a = -3 from "Complete addition formulas for prime order
+	// elliptic curves" (https://eprint.iacr.org/2015/1060), §A.2; trailing comments give each step.
+
+	t0 := new(fiat.P224Element).Square(p.x)    // t0 := X ^ 2
+	t1 := new(fiat.P224Element).Square(p.y)    // t1 := Y ^ 2
+	t2 := new(fiat.P224Element).Square(p.z)    // t2 := Z ^ 2
+	t3 := new(fiat.P224Element).Mul(p.x, p.y)  // t3 := X * Y
+	t3.Add(t3, t3)                             // t3 := t3 + t3
+	z3 := new(fiat.P224Element).Mul(p.x, p.z)  // Z3 := X * Z
+	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
+	y3 := new(fiat.P224Element).Mul(p224B, t2) // Y3 := b * t2
+	y3.Sub(y3, z3)                             // Y3 := Y3 - Z3
+	x3 := new(fiat.P224Element).Add(y3, y3)    // X3 := Y3 + Y3
+	y3.Add(x3, y3)                             // Y3 := X3 + Y3
+	x3.Sub(t1, y3)                             // X3 := t1 - Y3
+	y3.Add(t1, y3)                             // Y3 := t1 + Y3
+	y3.Mul(x3, y3)                             // Y3 := X3 * Y3
+	x3.Mul(x3, t3)                             // X3 := X3 * t3
+	t3.Add(t2, t2)                             // t3 := t2 + t2
+	t2.Add(t2, t3)                             // t2 := t2 + t3
+	z3.Mul(p224B, z3)                          // Z3 := b * Z3
+	z3.Sub(z3, t2)                             // Z3 := Z3 - t2
+	z3.Sub(z3, t0)                             // Z3 := Z3 - t0
+	t3.Add(z3, z3)                             // t3 := Z3 + Z3
+	z3.Add(z3, t3)                             // Z3 := Z3 + t3
+	t3.Add(t0, t0)                             // t3 := t0 + t0
+	t0.Add(t3, t0)                             // t0 := t3 + t0
+	t0.Sub(t0, t2)                             // t0 := t0 - t2
+	t0.Mul(t0, z3)                             // t0 := t0 * Z3
+	y3.Add(y3, t0)                             // Y3 := Y3 + t0
+	t0.Mul(p.y, p.z)                           // t0 := Y * Z
+	t0.Add(t0, t0)                             // t0 := t0 + t0
+	z3.Mul(t0, z3)                             // Z3 := t0 * Z3
+	x3.Sub(x3, z3)                             // X3 := X3 - Z3
+	z3.Mul(t0, t1)                             // Z3 := t0 * t1
+	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
+	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
+
+	q.x.Set(x3) // q is written only after every read of p
+	q.y.Set(y3)
+	q.z.Set(z3)
+	return q
+}
+
+// Select copies p1 into q when cond == 1 and p2 into q when cond == 0,
+// coordinate by coordinate, and returns q.
+func (q *P224Point) Select(p1, p2 *P224Point, cond int) *P224Point {
+	qx, qy, qz := q.x, q.y, q.z
+	qx.Select(p1.x, p2.x, cond)
+	qy.Select(p1.y, p2.y, cond)
+	qz.Select(p1.z, p2.z, cond)
+	return q
+}
+
+// ScalarMult sets p = scalar * q, and returns p. The scalar bytes are consumed first to last, high nibble before low nibble.
+func (p *P224Point) ScalarMult(q *P224Point, scalar []byte) *P224Point {
+	// table holds the first 16 multiples of q. The explicit NewP224Point calls
+	// get inlined, letting the allocations live on the stack.
+	var table = [16]*P224Point{
+		NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(),
+		NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(),
+		NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(),
+		NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(),
+	}
+	for i := 1; i < 16; i++ {
+		table[i].Add(table[i-1], q) // table[i] = i*q; table[0] stays the point at infinity
+	}
+
+	// Instead of doing the classic double-and-add chain, we do it with a
+	// four-bit window: we double four times, and then add [0-15]P.
+	t := NewP224Point()
+	p.Set(NewP224Point()) // q was fully consumed while building table, so p may alias q
+	for _, byte := range scalar { // NOTE: this loop variable shadows the builtin type name byte
+		p.Double(p)
+		p.Double(p)
+		p.Double(p)
+		p.Double(p)
+
+		for i := uint8(0); i < 16; i++ { // every entry is visited; t ends up as table[high nibble]
+			cond := subtle.ConstantTimeByteEq(byte>>4, i)
+			t.Select(table[i], t, cond)
+		}
+		p.Add(p, t)
+
+		p.Double(p)
+		p.Double(p)
+		p.Double(p)
+		p.Double(p)
+
+		for i := uint8(0); i < 16; i++ { // every entry is visited; t ends up as table[low nibble]
+			cond := subtle.ConstantTimeByteEq(byte&0b1111, i)
+			t.Select(table[i], t, cond)
+		}
+		p.Add(p, t)
+	}
+
+	return p
+}
--- a/src/crypto/elliptic/internal/nistec/p384.go
+++ b/src/crypto/elliptic/internal/nistec/p384.go
@ -0,0 +1,298 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package nistec
+
+import (
+	"crypto/elliptic/internal/fiat"
+	"crypto/subtle"
+	"errors"
+)
+
+var p384B, _ = new(fiat.P384Element).SetBytes([]byte{ // b in y² = x³ - 3x + b; error discarded because the input is this fixed table
+	0xb3, 0x31, 0x2f, 0xa7, 0xe2, 0x3e, 0xe7, 0xe4, 0x98, 0x8e, 0x05, 0x6b,
+	0xe3, 0xf8, 0x2d, 0x19, 0x18, 0x1d, 0x9c, 0x6e, 0xfe, 0x81, 0x41, 0x12,
+	0x03, 0x14, 0x08, 0x8f, 0x50, 0x13, 0x87, 0x5a, 0xc6, 0x56, 0x39, 0x8d,
+	0x8a, 0x2e, 0xd1, 0x9d, 0x2a, 0x85, 0xc8, 0xed, 0xd3, 0xec, 0x2a, 0xef})
+
+var p384G, _ = NewP384Point().SetBytes([]byte{0x4, // generator, uncompressed (0x04 prefix, then x, then y); error discarded for this fixed table
+	0xaa, 0x87, 0xca, 0x22, 0xbe, 0x8b, 0x05, 0x37, 0x8e, 0xb1, 0xc7, 0x1e,
+	0xf3, 0x20, 0xad, 0x74, 0x6e, 0x1d, 0x3b, 0x62, 0x8b, 0xa7, 0x9b, 0x98,
+	0x59, 0xf7, 0x41, 0xe0, 0x82, 0x54, 0x2a, 0x38, 0x55, 0x02, 0xf2, 0x5d,
+	0xbf, 0x55, 0x29, 0x6c, 0x3a, 0x54, 0x5e, 0x38, 0x72, 0x76, 0x0a, 0xb7,
+	0x36, 0x17, 0xde, 0x4a, 0x96, 0x26, 0x2c, 0x6f, 0x5d, 0x9e, 0x98, 0xbf,
+	0x92, 0x92, 0xdc, 0x29, 0xf8, 0xf4, 0x1d, 0xbd, 0x28, 0x9a, 0x14, 0x7c,
+	0xe9, 0xda, 0x31, 0x13, 0xb5, 0xf0, 0xb8, 0xc0, 0x0a, 0x60, 0xb1, 0xce,
+	0x1d, 0x7e, 0x81, 0x9d, 0x7a, 0x43, 0x1d, 0x7c, 0x90, 0xea, 0x0e, 0x5f})
+
+const p384ElementLength = 48 // length in bytes of one encoded coordinate, as sliced in SetBytes
+
+// P384Point is a P-384 point. The zero value is NOT valid (its coordinate pointers are nil); use NewP384Point or NewP384Generator.
+type P384Point struct {
+	// The point is represented in projective coordinates (X:Y:Z), where x = X/Z
+	// and y = Y/Z. Z = 0 denotes the point at infinity, which NewP384Point stores as (0:1:0).
+	x, y, z *fiat.P384Element
+}
+
+// NewP384Point returns a new P384Point representing the point at infinity.
+func NewP384Point() *P384Point {
+	return &P384Point{
+		x: new(fiat.P384Element), // X = 0
+		y: new(fiat.P384Element).One(),
+		z: new(fiat.P384Element), // Z = 0 marks the point at infinity
+	}
+}
+
+// NewP384Generator returns a new P384Point set to the canonical generator. The result is a copy, so changing it does not affect p384G.
+func NewP384Generator() *P384Point {
+	return (&P384Point{
+		x: new(fiat.P384Element),
+		y: new(fiat.P384Element),
+		z: new(fiat.P384Element),
+	}).Set(p384G) // Set copies the coordinate values into the freshly allocated elements
+}
+
+// Set copies the three projective coordinates of q into p and returns p.
+func (p *P384Point) Set(q *P384Point) *P384Point {
+	dx, dy, dz := p.x, p.y, p.z
+	dx.Set(q.x)
+	dy.Set(q.y)
+	dz.Set(q.z)
+	return p
+}
+
+// SetBytes sets p to the uncompressed or infinity value encoded in b, as
+// specified in SEC 1, Version 2.0, Section 2.3.4, and returns p. Compressed
+// encodings (prefix 2 or 3) are recognized but not yet supported. On any error
+// (bad encoding, point not on the curve) it returns nil and p is unchanged.
+func (p *P384Point) SetBytes(b []byte) (*P384Point, error) {
+	switch {
+	// Point at infinity.
+	case len(b) == 1 && b[0] == 0:
+		return p.Set(NewP384Point()), nil
+
+	// Uncompressed form.
+	case len(b) == 1+2*p384ElementLength && b[0] == 4:
+		x, err := new(fiat.P384Element).SetBytes(b[1 : 1+p384ElementLength])
+		if err != nil {
+			return nil, err
+		}
+		y, err := new(fiat.P384Element).SetBytes(b[1+p384ElementLength:])
+		if err != nil {
+			return nil, err
+		}
+		if err := p384CheckOnCurve(x, y); err != nil {
+			return nil, err
+		}
+		p.x.Set(x) // p is modified only after both coordinates and the curve equation check out
+		p.y.Set(y)
+		p.z.One()
+		return p, nil
+
+	// Compressed form: SEC 1 uses prefix 2 or 3 (not 0) followed by the x coordinate.
+	case len(b) == 1+p384ElementLength && (b[0] == 2 || b[0] == 3):
+		return nil, errors.New("unimplemented") // TODO(filippo)
+
+	default:
+		return nil, errors.New("invalid P384 point encoding")
+	}
+}
+
+// p384CheckOnCurve returns an error unless (x, y) satisfies y² = x³ - 3x + b.
+func p384CheckOnCurve(x, y *fiat.P384Element) error {
+	// Left-hand side: y².
+	lhs := new(fiat.P384Element).Square(y)
+
+	// Right-hand side: x³ - 3x + b, built as (x² * x) - (x + x + x) + b.
+	rhs := new(fiat.P384Element).Square(x)
+	rhs.Mul(rhs, x)
+
+	triple := new(fiat.P384Element).Add(x, x)
+	triple.Add(triple, x)
+	rhs.Sub(rhs, triple)
+	rhs.Add(rhs, p384B)
+
+	if rhs.Equal(lhs) == 1 {
+		return nil
+	}
+	return errors.New("P384 point not on curve")
+}
+
+// Bytes returns the uncompressed or infinity encoding of p, as specified in
+// SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the point at
+// infinity is shorter than all other encodings.
+func (p *P384Point) Bytes() []byte {
+	// This function is outlined to make the allocations inline in the caller
+	// rather than happen on the heap. The buffer holds exactly one uncompressed
+	// encoding: a prefix byte followed by two coordinates.
+	var out [1 + 2*p384ElementLength]byte
+	return p.bytes(&out)
+}
+
+// bytes writes the encoding of p into out and returns the used portion of it.
+func (p *P384Point) bytes(out *[1 + 2*p384ElementLength]byte) []byte {
+	if p.z.IsZero() == 1 {
+		return append(out[:0], 0) // point at infinity: the single byte 0
+	}
+
+	// Convert from projective (X:Y:Z) to affine coordinates x = X/Z, y = Y/Z.
+	zinv := new(fiat.P384Element).Invert(p.z)
+	xx := new(fiat.P384Element).Mul(p.x, zinv)
+	yy := new(fiat.P384Element).Mul(p.y, zinv)
+
+	buf := append(out[:0], 4) // 4 is the uncompressed-form prefix
+	buf = append(buf, xx.Bytes()...)
+	buf = append(buf, yy.Bytes()...)
+	return buf
+}
+
+// Add sets q = p1 + p2, and returns q. The points may overlap: results are staged in temporaries and copied into q at the end.
+func (q *P384Point) Add(p1, p2 *P384Point) *P384Point {
+	// Complete addition formula for a = -3 from "Complete addition formulas for prime order
+	// elliptic curves" (https://eprint.iacr.org/2015/1060), §A.2; trailing comments give each step.
+
+	t0 := new(fiat.P384Element).Mul(p1.x, p2.x) // t0 := X1 * X2
+	t1 := new(fiat.P384Element).Mul(p1.y, p2.y) // t1 := Y1 * Y2
+	t2 := new(fiat.P384Element).Mul(p1.z, p2.z) // t2 := Z1 * Z2
+	t3 := new(fiat.P384Element).Add(p1.x, p1.y) // t3 := X1 + Y1
+	t4 := new(fiat.P384Element).Add(p2.x, p2.y) // t4 := X2 + Y2
+	t3.Mul(t3, t4)                              // t3 := t3 * t4
+	t4.Add(t0, t1)                              // t4 := t0 + t1
+	t3.Sub(t3, t4)                              // t3 := t3 - t4
+	t4.Add(p1.y, p1.z)                          // t4 := Y1 + Z1
+	x3 := new(fiat.P384Element).Add(p2.y, p2.z) // X3 := Y2 + Z2
+	t4.Mul(t4, x3)                              // t4 := t4 * X3
+	x3.Add(t1, t2)                              // X3 := t1 + t2
+	t4.Sub(t4, x3)                              // t4 := t4 - X3
+	x3.Add(p1.x, p1.z)                          // X3 := X1 + Z1
+	y3 := new(fiat.P384Element).Add(p2.x, p2.z) // Y3 := X2 + Z2
+	x3.Mul(x3, y3)                              // X3 := X3 * Y3
+	y3.Add(t0, t2)                              // Y3 := t0 + t2
+	y3.Sub(x3, y3)                              // Y3 := X3 - Y3
+	z3 := new(fiat.P384Element).Mul(p384B, t2)  // Z3 := b * t2
+	x3.Sub(y3, z3)                              // X3 := Y3 - Z3
+	z3.Add(x3, x3)                              // Z3 := X3 + X3
+	x3.Add(x3, z3)                              // X3 := X3 + Z3
+	z3.Sub(t1, x3)                              // Z3 := t1 - X3
+	x3.Add(t1, x3)                              // X3 := t1 + X3
+	y3.Mul(p384B, y3)                           // Y3 := b * Y3
+	t1.Add(t2, t2)                              // t1 := t2 + t2
+	t2.Add(t1, t2)                              // t2 := t1 + t2
+	y3.Sub(y3, t2)                              // Y3 := Y3 - t2
+	y3.Sub(y3, t0)                              // Y3 := Y3 - t0
+	t1.Add(y3, y3)                              // t1 := Y3 + Y3
+	y3.Add(t1, y3)                              // Y3 := t1 + Y3
+	t1.Add(t0, t0)                              // t1 := t0 + t0
+	t0.Add(t1, t0)                              // t0 := t1 + t0
+	t0.Sub(t0, t2)                              // t0 := t0 - t2
+	t1.Mul(t4, y3)                              // t1 := t4 * Y3
+	t2.Mul(t0, y3)                              // t2 := t0 * Y3
+	y3.Mul(x3, z3)                              // Y3 := X3 * Z3
+	y3.Add(y3, t2)                              // Y3 := Y3 + t2
+	x3.Mul(t3, x3)                              // X3 := t3 * X3
+	x3.Sub(x3, t1)                              // X3 := X3 - t1
+	z3.Mul(t4, z3)                              // Z3 := t4 * Z3
+	t1.Mul(t3, t0)                              // t1 := t3 * t0
+	z3.Add(z3, t1)                              // Z3 := Z3 + t1
+
+	q.x.Set(x3) // q is written only after every read of p1 and p2
+	q.y.Set(y3)
+	q.z.Set(z3)
+	return q
+}
+
+// Double sets q = p + p, and returns q. The points may overlap.
+func (q *P384Point) Double(p *P384Point) *P384Point {
+	// Doubling formula for a = -3 from "Complete addition formulas for
+	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §A.2.
+
+	t0 := new(fiat.P384Element).Square(p.x)    // t0 := X ^ 2
+	t1 := new(fiat.P384Element).Square(p.y)    // t1 := Y ^ 2
+	t2 := new(fiat.P384Element).Square(p.z)    // t2 := Z ^ 2
+	t3 := new(fiat.P384Element).Mul(p.x, p.y)  // t3 := X * Y
+	t3.Add(t3, t3)                             // t3 := t3 + t3
+	z3 := new(fiat.P384Element).Mul(p.x, p.z)  // Z3 := X * Z
+	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
+	y3 := new(fiat.P384Element).Mul(p384B, t2) // Y3 := b * t2
+	y3.Sub(y3, z3)                             // Y3 := Y3 - Z3
+	x3 := new(fiat.P384Element).Add(y3, y3)    // X3 := Y3 + Y3
+	y3.Add(x3, y3)                             // Y3 := X3 + Y3
+	x3.Sub(t1, y3)                             // X3 := t1 - Y3
+	y3.Add(t1, y3)                             // Y3 := t1 + Y3
+	y3.Mul(x3, y3)                             // Y3 := X3 * Y3
+	x3.Mul(x3, t3)                             // X3 := X3 * t3
+	t3.Add(t2, t2)                             // t3 := t2 + t2
+	t2.Add(t2, t3)                             // t2 := t2 + t3
+	z3.Mul(p384B, z3)                          // Z3 := b * Z3
+	z3.Sub(z3, t2)                             // Z3 := Z3 - t2
+	z3.Sub(z3, t0)                             // Z3 := Z3 - t0
+	t3.Add(z3, z3)                             // t3 := Z3 + Z3
+	z3.Add(z3, t3)                             // Z3 := Z3 + t3
+	t3.Add(t0, t0)                             // t3 := t0 + t0
+	t0.Add(t3, t0)                             // t0 := t3 + t0
+	t0.Sub(t0, t2)                             // t0 := t0 - t2
+	t0.Mul(t0, z3)                             // t0 := t0 * Z3
+	y3.Add(y3, t0)                             // Y3 := Y3 + t0
+	t0.Mul(p.y, p.z)                           // t0 := Y * Z
+	t0.Add(t0, t0)                             // t0 := t0 + t0
+	z3.Mul(t0, z3)                             // Z3 := t0 * Z3
+	x3.Sub(x3, z3)                             // X3 := X3 - Z3
+	z3.Mul(t0, t1)                             // Z3 := t0 * t1
+	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
+	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
+
+	q.x.Set(x3) // q is written only after every read of p, so q may alias p
+	q.y.Set(y3)
+	q.z.Set(z3)
+	return q
+}
+
+// Select sets q to p1 if cond == 1, and to p2 if cond == 0, and returns q.
+func (q *P384Point) Select(p1, p2 *P384Point, cond int) *P384Point {
+	q.x.Select(p1.x, p2.x, cond) // each coordinate is selected independently with the same cond
+	q.y.Select(p1.y, p2.y, cond)
+	q.z.Select(p1.z, p2.z, cond)
+	return q
+}
+
+// ScalarMult sets p = scalar * q, and returns p.
+func (p *P384Point) ScalarMult(q *P384Point, scalar []byte) *P384Point {
+	// table holds the first 16 multiples of q. The explicit NewP384Point calls
+	// get inlined, letting the allocations live on the stack.
+	var table = [16]*P384Point{
+		NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(),
+		NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(),
+		NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(),
+		NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(),
+	}
+	for i := 1; i < 16; i++ {
+		table[i].Add(table[i-1], q) // table[i] = table[i-1] + q; table[0] keeps its freshly constructed value
+	}
+
+	// Instead of doing the classic double-and-add chain, we do it with a
+	// four-bit window: we double four times, and then add [0-15]P.
+	t := NewP384Point()
+	p.Set(NewP384Point()) // p is reset only after table was built from q, so p may alias q
+	for _, byte := range scalar {
+		p.Double(p)
+		p.Double(p)
+		p.Double(p)
+		p.Double(p)
+
+		for i := uint8(0); i < 16; i++ { // every entry is visited; the match is picked via Select rather than by indexing
+			cond := subtle.ConstantTimeByteEq(byte>>4, i) // high nibble of the scalar byte
+			t.Select(table[i], t, cond)
+		}
+		p.Add(p, t)
+
+		p.Double(p)
+		p.Double(p)
+		p.Double(p)
+		p.Double(p)
+
+		for i := uint8(0); i < 16; i++ { // same full-table scan for the second window
+			cond := subtle.ConstantTimeByteEq(byte&0b1111, i) // low nibble of the scalar byte
+			t.Select(table[i], t, cond)
+		}
+		p.Add(p, t)
+	}
+
+	return p
+}
--- a/src/crypto/elliptic/internal/nistec/p521.go
+++ b/src/crypto/elliptic/internal/nistec/p521.go
@ -58,7 +58,11 @@ func NewP521Point() *P521Point {

 // NewP521Generator returns a new P521Point set to the canonical generator.
 func NewP521Generator() *P521Point {
-	return NewP521Point().Set(p521G)
+	return (&P521Point{
+		x: new(fiat.P521Element),
+		y: new(fiat.P521Element),
+		z: new(fiat.P521Element),
+	}).Set(p521G)
 }

 // Set sets p = q and returns p.
--- a/src/crypto/elliptic/internal/nistec/p521_test.go
+++ b/src/crypto/elliptic/internal/nistec/p521_test.go
@ -1,44 +0,0 @@
-// Copyright 2021 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package nistec_test
-
-import (
-	"crypto/elliptic/internal/nistec"
-	"math/rand"
-	"os"
-	"strings"
-	"testing"
-)
-
-func TestP521Allocations(t *testing.T) {
-	if strings.HasSuffix(os.Getenv("GO_BUILDER_NAME"), "-noopt") {
-		t.Skip("skipping allocations test without relevant optimizations")
-	}
-	if allocs := testing.AllocsPerRun(100, func() {
-		p := nistec.NewP521Generator()
-		scalar := make([]byte, 66)
-		rand.Read(scalar)
-		p.ScalarMult(p, scalar)
-		out := p.Bytes()
-		if _, err := p.SetBytes(out); err != nil {
-			t.Fatal(err)
-		}
-	}); allocs > 0 {
-		t.Errorf("expected zero allocations, got %0.1f", allocs)
-	}
-}
-
-func BenchmarkScalarMult(b *testing.B) {
-	b.Run("P521", func(b *testing.B) {
-		scalar := make([]byte, 66)
-		rand.Read(scalar)
-		p := nistec.NewP521Generator()
-		b.ReportAllocs()
-		b.ResetTimer()
-		for i := 0; i < b.N; i++ {
-			p.ScalarMult(p, scalar)
-		}
-	})
-}
--- a/src/crypto/elliptic/p224.go
+++ b/src/crypto/elliptic/p224.go
@ -1,739 +1,136 @@
-// Copyright 2012 The Go Authors. All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package elliptic

-// This is a constant-time, 32-bit implementation of P224. See FIPS 186-3,
-// section D.2.2.
-//
-// See https://www.imperialviolet.org/2010/12/04/ecc.html ([1]) for background.
-
 import (
-	"encoding/binary"
+	"crypto/elliptic/internal/nistec"
+	"crypto/rand"
 	"math/big"
-	"math/bits"
 )

-var p224 p224Curve
-
+// p224Curve is a Curve implementation based on nistec.P224Point.
+//
+// It's a wrapper that exposes the big.Int-based Curve interface and encodes the
+// legacy idiosyncrasies it requires, such as invalid and infinity point
+// handling.
+//
+// To interact with the nistec package, points are encoded into and decoded from
+// properly formatted byte slices. All big.Int use is limited to this package.
+// Encoding and decoding is 1/1000th of the runtime of a scalar multiplication,
+// so the overhead is acceptable.
 type p224Curve struct {
-	*CurveParams
-	gx, gy, b p224FieldElement
+	params *CurveParams
 }

+var p224 p224Curve
+var _ Curve = p224
+
 func initP224() {
-	// See FIPS 186-3, section D.2.2
-	p224.CurveParams = &CurveParams{Name: "P-224"}
-	p224.P, _ = new(big.Int).SetString("26959946667150639794667015087019630673557916260026308143510066298881", 10)
-	p224.N, _ = new(big.Int).SetString("26959946667150639794667015087019625940457807714424391721682722368061", 10)
-	p224.B, _ = new(big.Int).SetString("b4050a850c04b3abf54132565044b0b7d7bfd8ba270b39432355ffb4", 16)
-	p224.Gx, _ = new(big.Int).SetString("b70e0cbd6bb4bf7f321390b94a03c1d356c21122343280d6115c1d21", 16)
-	p224.Gy, _ = new(big.Int).SetString("bd376388b5f723fb4c22dfe6cd4375a05a07476444d5819985007e34", 16)
-	p224.BitSize = 224
-
-	p224FromBig(&p224.gx, p224.Gx)
-	p224FromBig(&p224.gy, p224.Gy)
-	p224FromBig(&p224.b, p224.B)
-}
-
-// P224 returns a Curve which implements P-224 (see FIPS 186-3, section D.2.2).
-//
-// The cryptographic operations are implemented using constant-time algorithms.
-func P224() Curve {
-	initonce.Do(initAll)
-	return p224
+	p224.params = &CurveParams{
+		Name:    "P-224",
+		BitSize: 224,
+		// FIPS 186-4, section D.1.2.2
+		P:  bigFromDecimal("26959946667150639794667015087019630673557916260026308143510066298881"),
+		N:  bigFromDecimal("26959946667150639794667015087019625940457807714424391721682722368061"),
+		B:  bigFromHex("b4050a850c04b3abf54132565044b0b7d7bfd8ba270b39432355ffb4"),
+		Gx: bigFromHex("b70e0cbd6bb4bf7f321390b94a03c1d356c21122343280d6115c1d21"),
+		Gy: bigFromHex("bd376388b5f723fb4c22dfe6cd4375a05a07476444d5819985007e34"),
+	}
 }

 func (curve p224Curve) Params() *CurveParams {
-	return curve.CurveParams
+	return curve.params
 }

-func (curve p224Curve) IsOnCurve(bigX, bigY *big.Int) bool {
-	if bigX.BitLen() > 224 || bigY.BitLen() > 224 {
+func (curve p224Curve) IsOnCurve(x, y *big.Int) bool {
+	// IsOnCurve is documented to reject (0, 0), the conventional point at
+	// infinity, which however is accepted by p224PointFromAffine.
+	if x.Sign() == 0 && y.Sign() == 0 {
 		return false
 	}
-
-	var x, y p224FieldElement
-	p224FromBig(&x, bigX)
-	p224FromBig(&y, bigY)
-
-	// y² = x³ - 3x + b
-	var tmp p224LargeFieldElement
-	var x3 p224FieldElement
-	p224Square(&x3, &x, &tmp)
-	p224Mul(&x3, &x3, &x, &tmp)
-
-	for i := 0; i < 8; i++ {
-		x[i] *= 3
-	}
-	p224Sub(&x3, &x3, &x)
-	p224Reduce(&x3)
-	p224Add(&x3, &x3, &curve.b)
-	p224Contract(&x3, &x3)
-
-	p224Square(&y, &y, &tmp)
-	p224Contract(&y, &y)
-
-	for i := 0; i < 8; i++ {
-		if y[i] != x3[i] {
-			return false
-		}
-	}
-	return true
+	_, ok := p224PointFromAffine(x, y)
+	return ok
 }

-func (p224Curve) Add(bigX1, bigY1, bigX2, bigY2 *big.Int) (x, y *big.Int) {
-	var x1, y1, z1, x2, y2, z2, x3, y3, z3 p224FieldElement
-
-	p224FromBig(&x1, bigX1)
-	p224FromBig(&y1, bigY1)
-	if bigX1.Sign() != 0 || bigY1.Sign() != 0 {
-		z1[0] = 1
+func p224PointFromAffine(x, y *big.Int) (p *nistec.P224Point, ok bool) {
+	// (0, 0) is by convention the point at infinity, which can't be represented
+	// in affine coordinates. Marshal incorrectly encodes it as an uncompressed
+	// point, which SetBytes would correctly reject. See Issue 37294.
+	if x.Sign() == 0 && y.Sign() == 0 {
+		return nistec.NewP224Point(), true
 	}
-	p224FromBig(&x2, bigX2)
-	p224FromBig(&y2, bigY2)
-	if bigX2.Sign() != 0 || bigY2.Sign() != 0 {
-		z2[0] = 1
+	if x.BitLen() > 224 || y.BitLen() > 224 {
+		return nil, false
 	}
-
-	p224AddJacobian(&x3, &y3, &z3, &x1, &y1, &z1, &x2, &y2, &z2)
-	return p224ToAffine(&x3, &y3, &z3)
+	p, err := nistec.NewP224Point().SetBytes(Marshal(P224(), x, y))
+	if err != nil {
+		return nil, false
+	}
+	return p, true
 }

-func (p224Curve) Double(bigX1, bigY1 *big.Int) (x, y *big.Int) {
-	var x1, y1, z1, x2, y2, z2 p224FieldElement
-
-	p224FromBig(&x1, bigX1)
-	p224FromBig(&y1, bigY1)
-	z1[0] = 1
-
-	p224DoubleJacobian(&x2, &y2, &z2, &x1, &y1, &z1)
-	return p224ToAffine(&x2, &y2, &z2)
-}
-
-func (p224Curve) ScalarMult(bigX1, bigY1 *big.Int, scalar []byte) (x, y *big.Int) {
-	var x1, y1, z1, x2, y2, z2 p224FieldElement
-
-	p224FromBig(&x1, bigX1)
-	p224FromBig(&y1, bigY1)
-	z1[0] = 1
-
-	p224ScalarMult(&x2, &y2, &z2, &x1, &y1, &z1, scalar)
-	return p224ToAffine(&x2, &y2, &z2)
-}
-
-func (curve p224Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
-	var z1, x2, y2, z2 p224FieldElement
-
-	z1[0] = 1
-	p224ScalarMult(&x2, &y2, &z2, &curve.gx, &curve.gy, &z1, scalar)
-	return p224ToAffine(&x2, &y2, &z2)
-}
-
-// Field element functions.
-//
-// The field that we're dealing with is ℤ/pℤ where p = 2**224 - 2**96 + 1.
-//
-// Field elements are represented by a FieldElement, which is a typedef to an
-// array of 8 uint32's. The value of a FieldElement, a, is:
-//   a[0] + 2**28·a[1] + 2**56·a[1] + ... + 2**196·a[7]
-//
-// Using 28-bit limbs means that there's only 4 bits of headroom, which is less
-// than we would really like. But it has the useful feature that we hit 2**224
-// exactly, making the reflections during a reduce much nicer.
-type p224FieldElement [8]uint32
-
-// p224P is the order of the field, represented as a p224FieldElement.
-var p224P = p224FieldElement{1, 0, 0, 0xffff000, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff}
-
-// p224IsZero returns 1 if a == 0 mod p and 0 otherwise.
-//
-// a[i] < 2**29
-func p224IsZero(a *p224FieldElement) uint32 {
-	var minimal p224FieldElement
-	p224Contract(&minimal, a)
-
-	var acc uint32
-	for _, v := range minimal {
-		acc |= v
-	}
-	mask := ^maskIsNotZero(acc)
-
-	return 1 & mask
-}
-
-// p224Add computes *out = a+b
-//
-// a[i] + b[i] < 2**32
-func p224Add(out, a, b *p224FieldElement) {
-	for i := 0; i < 8; i++ {
-		out[i] = a[i] + b[i]
-	}
-}
-
-const two31p3 = 1<<31 + 1<<3
-const two31m3 = 1<<31 - 1<<3
-const two31m15m3 = 1<<31 - 1<<15 - 1<<3
-
-// p224ZeroModP31 is 0 mod p where bit 31 is set in all limbs so that we can
-// subtract smaller amounts without underflow. See the section "Subtraction" in
-// [1] for reasoning.
-//
-// To calculate this value, start by adding 2³¹ to the lowest limb and
-// subtracting 2³ from the next one to compensate. Repeat for each next limb,
-// ending up with 2³¹ - 2³ in each of them, and a carry of -2³. Apply the
-// reduction identity, and we need to subtract 2³ * 2⁹⁶ - 2³ = 2¹⁵ * 2⁸⁴ - 2³ so
-// we subtract 2¹⁵ from the 4th limb and add 2³ to the first limb.
-var p224ZeroModP31 = []uint32{two31p3, two31m3, two31m3, two31m15m3, two31m3, two31m3, two31m3, two31m3}
-
-// p224Sub computes *out = a-b
-//
-// a[i], b[i] < 2**30
-// out[i] < 2**32
-func p224Sub(out, a, b *p224FieldElement) {
-	for i := 0; i < 8; i++ {
-		out[i] = a[i] + p224ZeroModP31[i] - b[i]
-	}
-}
-
-// LargeFieldElement also represents an element of the field. The limbs are
-// still spaced 28-bits apart and in little-endian order. So the limbs are at
-// 0, 28, 56, ..., 392 bits, each 64-bits wide.
-type p224LargeFieldElement [15]uint64
-
-const two63p35 = 1<<63 + 1<<35
-const two63m35 = 1<<63 - 1<<35
-const two63m35m19 = 1<<63 - 1<<35 - 1<<19
-
-// p224ZeroModP63 is 0 mod p where bit 63 is set in all limbs. See the section
-// "Subtraction" in [1] for why.
-var p224ZeroModP63 = [8]uint64{two63p35, two63m35, two63m35, two63m35, two63m35m19, two63m35, two63m35, two63m35}
-
-const bottom12Bits = 0xfff
-const bottom28Bits = 0xfffffff
-
-// p224Mul computes *out = a*b
-//
-// a[i] < 2**29, b[i] < 2**30 (or vice versa)
-// out[i] < 2**29
-func p224Mul(out, a, b *p224FieldElement, tmp *p224LargeFieldElement) {
-	for i := range tmp {
-		tmp[i] = 0
-	}
-
-	for i := 0; i < 8; i++ {
-		for j := 0; j < 8; j++ {
-			tmp[i+j] += uint64(a[i]) * uint64(b[j])
-		}
-	}
-
-	p224ReduceLarge(out, tmp)
-}
-
-// Square computes *out = a*a
-//
-// a[i] < 2**29
-// out[i] < 2**29
-func p224Square(out, a *p224FieldElement, tmp *p224LargeFieldElement) {
-	for i := range tmp {
-		tmp[i] = 0
-	}
-
-	for i := 0; i < 8; i++ {
-		for j := 0; j <= i; j++ {
-			r := uint64(a[i]) * uint64(a[j])
-			if i == j {
-				tmp[i+j] += r
-			} else {
-				tmp[i+j] += r * 2
-			}
-		}
-	}
-
-	p224ReduceLarge(out, tmp)
-}
-
-// ReduceLarge converts a p224LargeFieldElement to a p224FieldElement.
-//
-// in[i] < 2**62
-// out[i] < 2**29
-func p224ReduceLarge(out *p224FieldElement, in *p224LargeFieldElement) {
-	for i := 0; i < 8; i++ {
-		in[i] += p224ZeroModP63[i]
-	}
-
-	// Eliminate the coefficients at 2**224 and greater by applying the
-	// reduction identity.
-	//
-	//   a + top * 2²²⁴ = a + top * 2⁹⁶ - top
-	//
-	// Since top here is in[8..14], both the subtraction at offset 0 and the
-	// addition at offset 96 (3 * 28 + 16) span multiple limbs. The subtraction
-	// can't underflow because of the p224ZeroModP63 addition above, while the
-	// addition can't overflow because of the 62 bit input bounds.
-	for i := 14; i >= 8; i-- {
-		in[i-8] -= in[i]
-		in[i-5] += (in[i] & 0xffff) << 12
-		in[i-4] += in[i] >> 16
-	}
-	in[8] = 0
-	// in[0..7] < 2**64
-	// in[9..14] discarded
-
-	// Run a carry chain and light reduction. Keep [0] large so we can do the
-	// subtraction safely. As the values become small enough, we start to store
-	// them in out and use 32-bit operations.
-	for i := 1; i < 8; i++ {
-		in[i+1] += in[i] >> 28
-		out[i] = uint32(in[i] & bottom28Bits)
-	}
-	in[0] -= in[8]
-	out[3] += uint32(in[8]&0xffff) << 12
-	out[4] += uint32(in[8] >> 16)
-	// in[0] < 2**64
-	// out[3] < 2**29
-	// out[4] < 2**29
-	// out[1,2,5..7] < 2**28
-
-	// Carry the overflow of [0] into the short 28 bit limbs.
-	out[0] = uint32(in[0] & bottom28Bits)
-	out[1] += uint32((in[0] >> 28) & bottom28Bits)
-	out[2] += uint32(in[0] >> 56)
-	// out[0] < 2**28
-	// out[1..4] < 2**29
-	// out[5..7] < 2**28
-}
-
-// Reduce reduces the coefficients of a to smaller bounds.
-//
-// On entry: a[i] < 2**31 + 2**30
-// On exit: a[i] < 2**29
-func p224Reduce(a *p224FieldElement) {
-	for i := 0; i < 7; i++ {
-		a[i+1] += a[i] >> 28
-		a[i] &= bottom28Bits
-	}
-	top := a[7] >> 28
-	a[7] &= bottom28Bits
-
-	a[0] -= top
-	a[3] += top << 12
-
-	// We may have just made a[0] negative but if we did top must have been not
-	// zero, so a[3] is not zero, so we can carry down to a[0]. (Note that we
-	// don't actually check if a[0] went negative, like in p224Contract, nor we
-	// try to stop the carry at a[1] or a[2], because here we can afford to go
-	// above 28 bits, so instead we carry all the way down from a[3].)
-	mask := maskIsNotZero(top)
-	a[3] -= 1 & mask
-	a[2] += mask & (1<<28 - 1)
-	a[1] += mask & (1<<28 - 1)
-	a[0] += mask & (1 << 28)
-}
-
-// p224Invert calculates *out = in**-1 by using Fermat's little theorem and
-// computing in**(p-2) = in**(2**224 - 2**96 - 1).
-func p224Invert(out, in *p224FieldElement) {
-	var f1, f2, f3, f4 p224FieldElement
-	var c p224LargeFieldElement
-
-	p224Square(&f1, in, &c)    // 2
-	p224Mul(&f1, &f1, in, &c)  // 2**2 - 1
-	p224Square(&f1, &f1, &c)   // 2**3 - 2
-	p224Mul(&f1, &f1, in, &c)  // 2**3 - 1
-	p224Square(&f2, &f1, &c)   // 2**4 - 2
-	p224Square(&f2, &f2, &c)   // 2**5 - 4
-	p224Square(&f2, &f2, &c)   // 2**6 - 8
-	p224Mul(&f1, &f1, &f2, &c) // 2**6 - 1
-	p224Square(&f2, &f1, &c)   // 2**7 - 2
-	for i := 0; i < 5; i++ {   // 2**12 - 2**6
-		p224Square(&f2, &f2, &c)
-	}
-	p224Mul(&f2, &f2, &f1, &c) // 2**12 - 1
-	p224Square(&f3, &f2, &c)   // 2**13 - 2
-	for i := 0; i < 11; i++ {  // 2**24 - 2**12
-		p224Square(&f3, &f3, &c)
-	}
-	p224Mul(&f2, &f3, &f2, &c) // 2**24 - 1
-	p224Square(&f3, &f2, &c)   // 2**25 - 2
-	for i := 0; i < 23; i++ {  // 2**48 - 2**24
-		p224Square(&f3, &f3, &c)
-	}
-	p224Mul(&f3, &f3, &f2, &c) // 2**48 - 1
-	p224Square(&f4, &f3, &c)   // 2**49 - 2
-	for i := 0; i < 47; i++ {  // 2**96 - 2**48
-		p224Square(&f4, &f4, &c)
-	}
-	p224Mul(&f3, &f3, &f4, &c) // 2**96 - 1
-	p224Square(&f4, &f3, &c)   // 2**97 - 2
-	for i := 0; i < 23; i++ {  // 2**120 - 2**24
-		p224Square(&f4, &f4, &c)
-	}
-	p224Mul(&f2, &f4, &f2, &c) // 2**120 - 1
-	for i := 0; i < 6; i++ {   // 2**126 - 2**6
-		p224Square(&f2, &f2, &c)
-	}
-	p224Mul(&f1, &f1, &f2, &c) // 2**126 - 1
-	p224Square(&f1, &f1, &c)   // 2**127 - 2
-	p224Mul(&f1, &f1, in, &c)  // 2**127 - 1
-	for i := 0; i < 97; i++ {  // 2**224 - 2**97
-		p224Square(&f1, &f1, &c)
-	}
-	p224Mul(out, &f1, &f3, &c) // 2**224 - 2**96 - 1
-}
-
-// p224Contract converts a FieldElement to its unique, minimal form.
-//
-// On entry, in[i] < 2**29
-// On exit, out[i] < 2**28 and out < p
-func p224Contract(out, in *p224FieldElement) {
-	copy(out[:], in[:])
-
-	// First, carry the bits above 28 to the higher limb.
-	for i := 0; i < 7; i++ {
-		out[i+1] += out[i] >> 28
-		out[i] &= bottom28Bits
-	}
-	top := out[7] >> 28
-	out[7] &= bottom28Bits
-
-	// Use the reduction identity to carry the overflow.
-	//
-	//   a + top * 2²²⁴ = a + top * 2⁹⁶ - top
-	out[0] -= top
-	out[3] += top << 12
-
-	// We may just have made out[0] negative. So we carry down. If we made
-	// out[0] negative then we know that out[3] is sufficiently positive
-	// because we just added to it.
-	for i := 0; i < 3; i++ {
-		mask := maskIsNegative(out[i])
-		out[i] += (1 << 28) & mask
-		out[i+1] -= 1 & mask
-	}
-
-	// We might have pushed out[3] over 2**28 so we perform another, partial,
-	// carry chain; carry the overflow according to the reduction identity; and
-	// carry down in case we made out[0] negative.
-	for i := 3; i < 7; i++ {
-		out[i+1] += out[i] >> 28
-		out[i] &= bottom28Bits
-	}
-	top = out[7] >> 28
-	out[7] &= bottom28Bits
-
-	out[0] -= top
-	out[3] += top << 12
-
-	for i := 0; i < 3; i++ {
-		mask := maskIsNegative(out[i])
-		out[i] += (1 << 28) & mask
-		out[i+1] -= 1 & mask
-	}
-
-	// There are two cases to consider for out[3]:
-	//   1) The first time that we eliminated top, we didn't push out[3] over
-	//      2**28. In this case, the partial carry chain didn't change any values
-	//      and top is now zero.
-	//   2) We did push out[3] over 2**28 the first time that we eliminated top.
-	//      The first value of top was in [0..2], therefore, after overflowing
-	//      and being reduced by the second carry chain, out[3] <= 2<<12 - 1.
-	// In both cases, out[3] cannot have overflowed when we eliminated top for
-	// the second time.
-
-	// Now we need to subtract p if the value is >= p. To check, we subtract p
-	// with a borrow chain and look at the final borrow bit.
-	var b uint32
-	for i := 0; i < len(out); i++ {
-		_, b = bits.Sub32(out[i], p224P[i], b)
-	}
-	mask := ^maskIsNotZero(b)
-
-	out[0] -= 1 & mask
-	out[3] -= 0xffff000 & mask
-	out[4] -= 0xfffffff & mask
-	out[5] -= 0xfffffff & mask
-	out[6] -= 0xfffffff & mask
-	out[7] -= 0xfffffff & mask
-
-	// Do one final carry down, in case we made out[0] negative. One of
-	// out[0..3] needs to be positive and able to absorb the -1 or the value
-	// would have been < p, and the subtraction wouldn't have happened.
-	for i := 0; i < 3; i++ {
-		mask := maskIsNegative(out[i])
-		out[i] += (1 << 28) & mask
-		out[i+1] -= 1 & mask
-	}
-}
-
-// maskIsNegative returns 0xffffffff if the most significant bit of v is set,
-// and 0 otherwise.
-func maskIsNegative(v uint32) uint32 { return uint32(int32(v) >> 31) }
-
-// maskIfNegative returns 0xffffffff if v is not zero, and 0 otherwise.
-func maskIsNotZero(v uint32) uint32 {
-	v |= v >> 16
-	v |= v >> 8
-	v |= v >> 4
-	v |= v >> 2
-	v |= v >> 1
-	return uint32(int32(v<<31) >> 31)
-}
-
-// Group element functions.
-//
-// These functions deal with group elements. The group is an elliptic curve
-// group with a = -3 defined in FIPS 186-3, section D.2.2.
-
-// p224AddJacobian computes *out = a+b where a != b.
-func p224AddJacobian(x3, y3, z3, x1, y1, z1, x2, y2, z2 *p224FieldElement) {
-	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-p224Add-2007-bl
-	var z1z1, z2z2, u1, u2, s1, s2, h, i, j, r, v p224FieldElement
-	var c p224LargeFieldElement
-
-	z1IsZero := p224IsZero(z1)
-	z2IsZero := p224IsZero(z2)
-
-	// Z1Z1 = Z1²
-	p224Square(&z1z1, z1, &c)
-	// Z2Z2 = Z2²
-	p224Square(&z2z2, z2, &c)
-	// U1 = X1*Z2Z2
-	p224Mul(&u1, x1, &z2z2, &c)
-	// U2 = X2*Z1Z1
-	p224Mul(&u2, x2, &z1z1, &c)
-	// S1 = Y1*Z2*Z2Z2
-	p224Mul(&s1, z2, &z2z2, &c)
-	p224Mul(&s1, y1, &s1, &c)
-	// S2 = Y2*Z1*Z1Z1
-	p224Mul(&s2, z1, &z1z1, &c)
-	p224Mul(&s2, y2, &s2, &c)
-	// H = U2-U1
-	p224Sub(&h, &u2, &u1)
-	p224Reduce(&h)
-	xEqual := p224IsZero(&h)
-	// I = (2*H)²
-	for j := 0; j < 8; j++ {
-		i[j] = h[j] << 1
-	}
-	p224Reduce(&i)
-	p224Square(&i, &i, &c)
-	// J = H*I
-	p224Mul(&j, &h, &i, &c)
-	// r = 2*(S2-S1)
-	p224Sub(&r, &s2, &s1)
-	p224Reduce(&r)
-	yEqual := p224IsZero(&r)
-	if xEqual == 1 && yEqual == 1 && z1IsZero == 0 && z2IsZero == 0 {
-		p224DoubleJacobian(x3, y3, z3, x1, y1, z1)
-		return
-	}
-	for i := 0; i < 8; i++ {
-		r[i] <<= 1
-	}
-	p224Reduce(&r)
-	// V = U1*I
-	p224Mul(&v, &u1, &i, &c)
-	// Z3 = ((Z1+Z2)²-Z1Z1-Z2Z2)*H
-	p224Add(&z1z1, &z1z1, &z2z2)
-	p224Add(&z2z2, z1, z2)
-	p224Reduce(&z2z2)
-	p224Square(&z2z2, &z2z2, &c)
-	p224Sub(z3, &z2z2, &z1z1)
-	p224Reduce(z3)
-	p224Mul(z3, z3, &h, &c)
-	// X3 = r²-J-2*V
-	for i := 0; i < 8; i++ {
-		z1z1[i] = v[i] << 1
-	}
-	p224Add(&z1z1, &j, &z1z1)
-	p224Reduce(&z1z1)
-	p224Square(x3, &r, &c)
-	p224Sub(x3, x3, &z1z1)
-	p224Reduce(x3)
-	// Y3 = r*(V-X3)-2*S1*J
-	for i := 0; i < 8; i++ {
-		s1[i] <<= 1
-	}
-	p224Mul(&s1, &s1, &j, &c)
-	p224Sub(&z1z1, &v, x3)
-	p224Reduce(&z1z1)
-	p224Mul(&z1z1, &z1z1, &r, &c)
-	p224Sub(y3, &z1z1, &s1)
-	p224Reduce(y3)
-
-	p224CopyConditional(x3, x2, z1IsZero)
-	p224CopyConditional(x3, x1, z2IsZero)
-	p224CopyConditional(y3, y2, z1IsZero)
-	p224CopyConditional(y3, y1, z2IsZero)
-	p224CopyConditional(z3, z2, z1IsZero)
-	p224CopyConditional(z3, z1, z2IsZero)
-}
-
-// p224DoubleJacobian computes *out = a+a.
-func p224DoubleJacobian(x3, y3, z3, x1, y1, z1 *p224FieldElement) {
-	var delta, gamma, beta, alpha, t p224FieldElement
-	var c p224LargeFieldElement
-
-	p224Square(&delta, z1, &c)
-	p224Square(&gamma, y1, &c)
-	p224Mul(&beta, x1, &gamma, &c)
-
-	// alpha = 3*(X1-delta)*(X1+delta)
-	p224Add(&t, x1, &delta)
-	for i := 0; i < 8; i++ {
-		t[i] += t[i] << 1
-	}
-	p224Reduce(&t)
-	p224Sub(&alpha, x1, &delta)
-	p224Reduce(&alpha)
-	p224Mul(&alpha, &alpha, &t, &c)
-
-	// Z3 = (Y1+Z1)²-gamma-delta
-	p224Add(z3, y1, z1)
-	p224Reduce(z3)
-	p224Square(z3, z3, &c)
-	p224Sub(z3, z3, &gamma)
-	p224Reduce(z3)
-	p224Sub(z3, z3, &delta)
-	p224Reduce(z3)
-
-	// X3 = alpha²-8*beta
-	for i := 0; i < 8; i++ {
-		delta[i] = beta[i] << 3
-	}
-	p224Reduce(&delta)
-	p224Square(x3, &alpha, &c)
-	p224Sub(x3, x3, &delta)
-	p224Reduce(x3)
-
-	// Y3 = alpha*(4*beta-X3)-8*gamma²
-	for i := 0; i < 8; i++ {
-		beta[i] <<= 2
-	}
-	p224Sub(&beta, &beta, x3)
-	p224Reduce(&beta)
-	p224Square(&gamma, &gamma, &c)
-	for i := 0; i < 8; i++ {
-		gamma[i] <<= 3
-	}
-	p224Reduce(&gamma)
-	p224Mul(y3, &alpha, &beta, &c)
-	p224Sub(y3, y3, &gamma)
-	p224Reduce(y3)
-}
-
-// p224CopyConditional sets *out = *in in constant time if control is not zero.
-func p224CopyConditional(out, in *p224FieldElement, control uint32) {
-	mask := maskIsNotZero(control)
-	for i := 0; i < 8; i++ {
-		out[i] ^= (out[i] ^ in[i]) & mask
-	}
-}
-
-func p224ScalarMult(outX, outY, outZ, inX, inY, inZ *p224FieldElement, scalar []byte) {
-	var xx, yy, zz p224FieldElement
-	for i := 0; i < 8; i++ {
-		outX[i] = 0
-		outY[i] = 0
-		outZ[i] = 0
-	}
-
-	for _, byte := range scalar {
-		for bitNum := uint(0); bitNum < 8; bitNum++ {
-			p224DoubleJacobian(outX, outY, outZ, outX, outY, outZ)
-			bit := uint32((byte >> (7 - bitNum)) & 1)
-			p224AddJacobian(&xx, &yy, &zz, inX, inY, inZ, outX, outY, outZ)
-			p224CopyConditional(outX, &xx, bit)
-			p224CopyConditional(outY, &yy, bit)
-			p224CopyConditional(outZ, &zz, bit)
-		}
-	}
-}
-
-// p224ToAffine converts from Jacobian to affine form.
-func p224ToAffine(x, y, z *p224FieldElement) (*big.Int, *big.Int) {
-	var zinv, zinvsq, outx, outy p224FieldElement
-	var tmp p224LargeFieldElement
-
-	if isPointAtInfinity := p224IsZero(z); isPointAtInfinity == 1 {
+func p224PointToAffine(p *nistec.P224Point) (x, y *big.Int) {
+	out := p.Bytes()
+	if len(out) == 1 && out[0] == 0 {
+		// This is the correct encoding of the point at infinity, which
+		// Unmarshal does not support. See Issue 37294.
 		return new(big.Int), new(big.Int)
 	}
-
-	p224Invert(&zinv, z)
-	p224Square(&zinvsq, &zinv, &tmp)
-	p224Mul(x, x, &zinvsq, &tmp)
-	p224Mul(&zinvsq, &zinvsq, &zinv, &tmp)
-	p224Mul(y, y, &zinvsq, &tmp)
-
-	p224Contract(&outx, x)
-	p224Contract(&outy, y)
-	return p224ToBig(&outx), p224ToBig(&outy)
-}
-
-// get28BitsFromEnd returns the least-significant 28 bits from buf>>shift,
-// where buf is interpreted as a big-endian number. shift must be at most
-// 4 bits higher than a multiple of 8.
-func get28BitsFromEnd(buf []byte, shift int) uint32 {
-	buf = buf[:len(buf)-shift/8]
-	shift = shift % 8
-	if shift > 4 {
-		panic("misuse of get28BitsFromEnd")
+	x, y = Unmarshal(P224(), out)
+	if x == nil {
+		panic("crypto/elliptic: internal error: Unmarshal rejected a valid point encoding")
 	}
-
-	ret := binary.BigEndian.Uint32(buf[len(buf)-4:])
-	ret >>= shift
-	ret &= bottom28Bits
-	return ret
+	return x, y
 }

-// p224FromBig sets *out = *in.
-func p224FromBig(out *p224FieldElement, in *big.Int) {
-	bytes := in.FillBytes(make([]byte, 224/8))
-	for i := range out {
-		out[i] = get28BitsFromEnd(bytes, 28*i)
+// p224RandomPoint returns a random point on the curve. It's used when Add,
+// Double, or ScalarMult are fed a point not on the curve, which is undefined
+// behavior. Originally, we used to do the math on it anyway (which allows
+// invalid curve attacks) and relied on the caller and Unmarshal to avoid this
+// happening in the first place. Now, we just can't construct a nistec.P224Point
+// for an invalid pair of coordinates, because that API is safer. If we panic,
+// we risk introducing a DoS. If we return nil, we risk a panic. If we return
+// the input, ecdsa.Verify might fail open. The safest course seems to be to
+// return a valid, random point, which hopefully won't help the attacker.
+func p224RandomPoint() (x, y *big.Int) {
+	_, x, y, err := GenerateKey(P224(), rand.Reader) // the first result is discarded; only the x, y coordinates are returned
+	if err != nil {
+		panic("crypto/elliptic: failed to generate random point")
 	}
+	return x, y
 }

-// p224ToBig returns in as a big.Int.
-func p224ToBig(in *p224FieldElement) *big.Int {
-	var buf [28]byte
-	buf[27] = byte(in[0])
-	buf[26] = byte(in[0] >> 8)
-	buf[25] = byte(in[0] >> 16)
-	buf[24] = byte(((in[0] >> 24) & 0x0f) | (in[1]<<4)&0xf0)
-
-	buf[23] = byte(in[1] >> 4)
-	buf[22] = byte(in[1] >> 12)
-	buf[21] = byte(in[1] >> 20)
-
-	buf[20] = byte(in[2])
-	buf[19] = byte(in[2] >> 8)
-	buf[18] = byte(in[2] >> 16)
-	buf[17] = byte(((in[2] >> 24) & 0x0f) | (in[3]<<4)&0xf0)
-
-	buf[16] = byte(in[3] >> 4)
-	buf[15] = byte(in[3] >> 12)
-	buf[14] = byte(in[3] >> 20)
-
-	buf[13] = byte(in[4])
-	buf[12] = byte(in[4] >> 8)
-	buf[11] = byte(in[4] >> 16)
-	buf[10] = byte(((in[4] >> 24) & 0x0f) | (in[5]<<4)&0xf0)
-
-	buf[9] = byte(in[5] >> 4)
-	buf[8] = byte(in[5] >> 12)
-	buf[7] = byte(in[5] >> 20)
-
-	buf[6] = byte(in[6])
-	buf[5] = byte(in[6] >> 8)
-	buf[4] = byte(in[6] >> 16)
-	buf[3] = byte(((in[6] >> 24) & 0x0f) | (in[7]<<4)&0xf0)
-
-	buf[2] = byte(in[7] >> 4)
-	buf[1] = byte(in[7] >> 12)
-	buf[0] = byte(in[7] >> 20)
-
-	return new(big.Int).SetBytes(buf[:])
+func (p224Curve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
+	p1, ok := p224PointFromAffine(x1, y1)
+	if !ok {
+		return p224RandomPoint() // p224PointFromAffine rejected (x1, y1); see p224RandomPoint for the fallback rationale
+	}
+	p2, ok := p224PointFromAffine(x2, y2)
+	if !ok {
+		return p224RandomPoint() // same fallback when (x2, y2) is rejected
+	}
+	return p224PointToAffine(p1.Add(p1, p2)) // p1 is reused as the destination of the sum
+}
+
+func (p224Curve) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
+	p, ok := p224PointFromAffine(x1, y1)
+	if !ok {
+		return p224RandomPoint() // p224PointFromAffine rejected (x1, y1); see p224RandomPoint for the fallback rationale
+	}
+	return p224PointToAffine(p.Double(p)) // p is both the input and the destination
+}
+
+func (p224Curve) ScalarMult(Bx, By *big.Int, scalar []byte) (*big.Int, *big.Int) {
+	p, ok := p224PointFromAffine(Bx, By)
+	if !ok {
+		return p224RandomPoint() // p224PointFromAffine rejected (Bx, By); see p224RandomPoint for the fallback rationale
+	}
+	return p224PointToAffine(p.ScalarMult(p, scalar)) // p is both the base point and the destination
+}
+
+func (p224Curve) ScalarBaseMult(scalar []byte) (*big.Int, *big.Int) {
+	p := nistec.NewP224Generator()
+	return p224PointToAffine(p.ScalarMult(p, scalar)) // p is both the base point and the destination
 }
--- a/src/crypto/elliptic/p224_test.go
+++ b/src/crypto/elliptic/p224_test.go
@ -8,313 +8,9 @@ import (
 	"encoding/hex"
 	"fmt"
 	"math/big"
-	"math/bits"
-	"math/rand"
-	"reflect"
 	"testing"
-	"testing/quick"
 )

-var toFromBigTests = []string{
-	"0",
-	"1",
-	"23",
-	"b70e0cb46bb4bf7f321390b94a03c1d356c01122343280d6105c1d21",
-	"706a46d476dcb76798e6046d89474788d164c18032d268fd10704fa6",
-}
-
-func p224AlternativeToBig(in *p224FieldElement) *big.Int {
-	ret := new(big.Int)
-	tmp := new(big.Int)
-
-	for i := len(in) - 1; i >= 0; i-- {
-		ret.Lsh(ret, 28)
-		tmp.SetInt64(int64(in[i]))
-		ret.Add(ret, tmp)
-	}
-	ret.Mod(ret, P224().Params().P)
-	return ret
-}
-
-func TestP224ToFromBig(t *testing.T) {
-	for i, test := range toFromBigTests {
-		n, _ := new(big.Int).SetString(test, 16)
-		var x p224FieldElement
-		p224FromBig(&x, n)
-		m := p224ToBig(&x)
-		if n.Cmp(m) != 0 {
-			t.Errorf("#%d: %x != %x", i, n, m)
-		}
-		q := p224AlternativeToBig(&x)
-		if n.Cmp(q) != 0 {
-			t.Errorf("#%d: %x != %x (alternative)", i, n, q)
-		}
-	}
-}
-
-// quickCheckConfig32 will make each quickcheck test run (32 * -quickchecks)
-// times. The default value of -quickchecks is 100.
-var quickCheckConfig32 = &quick.Config{MaxCountScale: 32}
-
-// weirdLimbs can be combined to generate a range of edge-case field elements.
-var weirdLimbs = [...]uint32{
-	0, 1, (1 << 29) - 1,
-	(1 << 12), (1 << 12) - 1,
-	(1 << 28), (1 << 28) - 1,
-}
-
-func generateLimb(rand *rand.Rand) uint32 {
-	const bottom29Bits = 0x1fffffff
-	n := rand.Intn(len(weirdLimbs) + 3)
-	switch n {
-	case len(weirdLimbs):
-		// Random value.
-		return uint32(rand.Int31n(1 << 29))
-	case len(weirdLimbs) + 1:
-		// Sum of two values.
-		k := generateLimb(rand) + generateLimb(rand)
-		return k & bottom29Bits
-	case len(weirdLimbs) + 2:
-		// Difference of two values.
-		k := generateLimb(rand) - generateLimb(rand)
-		return k & bottom29Bits
-	default:
-		return weirdLimbs[n]
-	}
-}
-
-func (p224FieldElement) Generate(rand *rand.Rand, size int) reflect.Value {
-	return reflect.ValueOf(p224FieldElement{
-		generateLimb(rand),
-		generateLimb(rand),
-		generateLimb(rand),
-		generateLimb(rand),
-		generateLimb(rand),
-		generateLimb(rand),
-		generateLimb(rand),
-		generateLimb(rand),
-	})
-}
-
-func isInBounds(x *p224FieldElement) bool {
-	return bits.Len32(x[0]) <= 29 &&
-		bits.Len32(x[1]) <= 29 &&
-		bits.Len32(x[2]) <= 29 &&
-		bits.Len32(x[3]) <= 29 &&
-		bits.Len32(x[4]) <= 29 &&
-		bits.Len32(x[5]) <= 29 &&
-		bits.Len32(x[6]) <= 29 &&
-		bits.Len32(x[7]) <= 29
-}
-
-func TestP224Mul(t *testing.T) {
-	mulMatchesBigInt := func(a, b, out p224FieldElement) bool {
-		var tmp p224LargeFieldElement
-		p224Mul(&out, &a, &b, &tmp)
-
-		exp := new(big.Int).Mul(p224AlternativeToBig(&a), p224AlternativeToBig(&b))
-		exp.Mod(exp, P224().Params().P)
-		got := p224AlternativeToBig(&out)
-		if exp.Cmp(got) != 0 || !isInBounds(&out) {
-			t.Logf("a = %x", a)
-			t.Logf("b = %x", b)
-			t.Logf("p224Mul(a, b) = %x = %v", out, got)
-			t.Logf("a * b = %v", exp)
-			return false
-		}
-
-		return true
-	}
-
-	a := p224FieldElement{0xfffffff, 0xfffffff, 0xf00ffff, 0x20f, 0x0, 0x0, 0x0, 0x0}
-	b := p224FieldElement{1, 0, 0, 0, 0, 0, 0, 0}
-	if !mulMatchesBigInt(a, b, p224FieldElement{}) {
-		t.Fail()
-	}
-
-	if err := quick.Check(mulMatchesBigInt, quickCheckConfig32); err != nil {
-		t.Error(err)
-	}
-}
-
-func TestP224Square(t *testing.T) {
-	squareMatchesBigInt := func(a, out p224FieldElement) bool {
-		var tmp p224LargeFieldElement
-		p224Square(&out, &a, &tmp)
-
-		exp := p224AlternativeToBig(&a)
-		exp.Mul(exp, exp)
-		exp.Mod(exp, P224().Params().P)
-		got := p224AlternativeToBig(&out)
-		if exp.Cmp(got) != 0 || !isInBounds(&out) {
-			t.Logf("a = %x", a)
-			t.Logf("p224Square(a, b) = %x = %v", out, got)
-			t.Logf("a * a = %v", exp)
-			return false
-		}
-
-		return true
-	}
-
-	if err := quick.Check(squareMatchesBigInt, quickCheckConfig32); err != nil {
-		t.Error(err)
-	}
-}
-
-func TestP224Add(t *testing.T) {
-	addMatchesBigInt := func(a, b, out p224FieldElement) bool {
-		p224Add(&out, &a, &b)
-
-		exp := new(big.Int).Add(p224AlternativeToBig(&a), p224AlternativeToBig(&b))
-		exp.Mod(exp, P224().Params().P)
-		got := p224AlternativeToBig(&out)
-		if exp.Cmp(got) != 0 {
-			t.Logf("a = %x", a)
-			t.Logf("b = %x", b)
-			t.Logf("p224Add(a, b) = %x = %v", out, got)
-			t.Logf("a + b = %v", exp)
-			return false
-		}
-
-		return true
-	}
-
-	if err := quick.Check(addMatchesBigInt, quickCheckConfig32); err != nil {
-		t.Error(err)
-	}
-}
-
-func TestP224Reduce(t *testing.T) {
-	reduceMatchesBigInt := func(a p224FieldElement) bool {
-		out := a
-		// TODO: generate higher values for functions like p224Reduce that are
-		// expected to work with higher input bounds.
-		p224Reduce(&out)
-
-		exp := p224AlternativeToBig(&a)
-		got := p224AlternativeToBig(&out)
-		if exp.Cmp(got) != 0 || !isInBounds(&out) {
-			t.Logf("a = %x = %v", a, exp)
-			t.Logf("p224Reduce(a) = %x = %v", out, got)
-			return false
-		}
-
-		return true
-	}
-
-	if err := quick.Check(reduceMatchesBigInt, quickCheckConfig32); err != nil {
-		t.Error(err)
-	}
-}
-
-func TestP224Contract(t *testing.T) {
-	contractMatchesBigInt := func(a, out p224FieldElement) bool {
-		p224Contract(&out, &a)
-
-		exp := p224AlternativeToBig(&a)
-		got := p224AlternativeToBig(&out)
-		if exp.Cmp(got) != 0 {
-			t.Logf("a = %x = %v", a, exp)
-			t.Logf("p224Contract(a) = %x = %v", out, got)
-			return false
-		}
-
-		// Check that out < P.
-		for i := range p224P {
-			k := 8 - i - 1
-			if out[k] > p224P[k] {
-				t.Logf("p224Contract(a) = %x", out)
-				return false
-			}
-			if out[k] < p224P[k] {
-				return true
-			}
-		}
-		t.Logf("p224Contract(a) = %x", out)
-		return false
-	}
-
-	if !contractMatchesBigInt(p224P, p224FieldElement{}) {
-		t.Error("p224Contract(p) is broken")
-	}
-	pMinus1 := p224FieldElement{0, 0, 0, 0xffff000, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff}
-	if !contractMatchesBigInt(pMinus1, p224FieldElement{}) {
-		t.Error("p224Contract(p - 1) is broken")
-	}
-	// Check that we can handle input above p, but lowest limb zero.
-	a := p224FieldElement{0, 1, 0, 0xffff000, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff}
-	if !contractMatchesBigInt(a, p224FieldElement{}) {
-		t.Error("p224Contract(p + 2²⁸) is broken")
-	}
-	// Check that we can handle input above p, but lowest three limbs zero.
-	b := p224FieldElement{0, 0, 0, 0xffff001, 0xfffffff, 0xfffffff, 0xfffffff, 0xfffffff}
-	if !contractMatchesBigInt(b, p224FieldElement{}) {
-		t.Error("p224Contract(p + 2⁸⁴) is broken")
-	}
-
-	if err := quick.Check(contractMatchesBigInt, quickCheckConfig32); err != nil {
-		t.Error(err)
-	}
-}
-
-func TestP224IsZero(t *testing.T) {
-	if got := p224IsZero(&p224FieldElement{}); got != 1 {
-		t.Errorf("p224IsZero(0) = %d, expected 1", got)
-	}
-	if got := p224IsZero(&p224P); got != 1 {
-		t.Errorf("p224IsZero(p) = %d, expected 1", got)
-	}
-	if got := p224IsZero(&p224FieldElement{1}); got != 0 {
-		t.Errorf("p224IsZero(1) = %d, expected 0", got)
-	}
-
-	isZeroMatchesBigInt := func(a p224FieldElement) bool {
-		isZero := p224IsZero(&a)
-
-		big := p224AlternativeToBig(&a)
-		if big.Sign() == 0 && isZero != 1 {
-			return false
-		}
-		if big.Sign() != 0 && isZero != 0 {
-			return false
-		}
-		return true
-	}
-
-	if err := quick.Check(isZeroMatchesBigInt, quickCheckConfig32); err != nil {
-		t.Error(err)
-	}
-}
-
-func TestP224Invert(t *testing.T) {
-	var out p224FieldElement
-
-	p224Invert(&out, &p224FieldElement{})
-	if got := p224IsZero(&out); got != 1 {
-		t.Errorf("p224Invert(0) = %x, expected 0", out)
-	}
-
-	p224Invert(&out, &p224P)
-	if got := p224IsZero(&out); got != 1 {
-		t.Errorf("p224Invert(p) = %x, expected 0", out)
-	}
-
-	p224Invert(&out, &p224FieldElement{1})
-	p224Contract(&out, &out)
-	if out != (p224FieldElement{1}) {
-		t.Errorf("p224Invert(1) = %x, expected 1", out)
-	}
-
-	var tmp p224LargeFieldElement
-	a := p224FieldElement{1, 2, 3, 4, 5, 6, 7, 8}
-	p224Invert(&out, &a)
-	p224Mul(&out, &out, &a, &tmp)
-	p224Contract(&out, &out)
-	if out != (p224FieldElement{1}) {
-		t.Errorf("p224Invert(a) * a = %x, expected 1", out)
-	}
-}
-
 type baseMultTest struct {
 	k    string
 	x, y string
@ -602,7 +298,7 @@ func TestP224BaseMult(t *testing.T) {

 func TestP224GenericBaseMult(t *testing.T) {
 	// We use the P224 CurveParams directly in order to test the generic implementation.
-	p224 := P224().Params()
+	p224 := genericParamsForCurve(P224())
 	for i, e := range p224BaseMultTests {
 		k, ok := new(big.Int).SetString(e.k, 10)
 		if !ok {
--- a/src/crypto/elliptic/p256.go
+++ b/src/crypto/elliptic/p256.go
@ -209,6 +209,8 @@ var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{

 // Field element operations:

+const bottom28Bits = 0xfffffff
+
 // nonZeroToAllOnes returns:
 //   0xffffffff for 0 < x <= 2**31
 //   0 for x == 0 or x > 2**31.
@ -269,6 +271,7 @@ const (
 	two30m2    = 1<<30 - 1<<2
 	two30p13m2 = 1<<30 + 1<<13 - 1<<2
 	two31m2    = 1<<31 - 1<<2
+	two31m3    = 1<<31 - 1<<3
 	two31p24m2 = 1<<31 + 1<<24 - 1<<2
 	two30m27m2 = 1<<30 - 1<<27 - 1<<2
 )
--- a/src/crypto/elliptic/p256_test.go
+++ b/src/crypto/elliptic/p256_test.go
@ -34,7 +34,7 @@ var p256MultTests = []scalarMultTest{

 func TestP256BaseMult(t *testing.T) {
 	p256 := P256()
-	p256Generic := p256.Params()
+	p256Generic := genericParamsForCurve(p256)

 	scalars := make([]*big.Int, 0, len(p224BaseMultTests)+1)
 	for _, e := range p224BaseMultTests {
@ -60,23 +60,6 @@ func TestP256BaseMult(t *testing.T) {

 func TestP256Mult(t *testing.T) {
 	p256 := P256()
-	p256Generic := p256.Params()
-
-	for i, e := range p224BaseMultTests {
-		x, _ := new(big.Int).SetString(e.x, 16)
-		y, _ := new(big.Int).SetString(e.y, 16)
-		k, _ := new(big.Int).SetString(e.k, 10)
-
-		xx, yy := p256.ScalarMult(x, y, k.Bytes())
-		xx2, yy2 := p256Generic.ScalarMult(x, y, k.Bytes())
-		if xx.Cmp(xx2) != 0 || yy.Cmp(yy2) != 0 {
-			t.Errorf("#%d: got (%x, %x), want (%x, %x)", i, xx, yy, xx2, yy2)
-		}
-		if testing.Short() && i > 5 {
-			break
-		}
-	}
-
 	for i, e := range p256MultTests {
 		x, _ := new(big.Int).SetString(e.xIn, 16)
 		y, _ := new(big.Int).SetString(e.yIn, 16)
--- a/src/crypto/elliptic/p384.go
+++ b/src/crypto/elliptic/p384.go
@ -0,0 +1,141 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package elliptic
+
+import (
+	"crypto/elliptic/internal/nistec"
+	"crypto/rand"
+	"math/big"
+)
+
+// p384Curve is a Curve implementation based on nistec.P384Point.
+//
+// It's a wrapper that exposes the big.Int-based Curve interface and encodes the
+// legacy idiosyncrasies it requires, such as invalid and infinity point
+// handling.
+//
+// To interact with the nistec package, points are encoded into and decoded from
+// properly formatted byte slices. All big.Int use is limited to this package.
+// Encoding and decoding is 1/1000th of the runtime of a scalar multiplication,
+// so the overhead is acceptable.
+type p384Curve struct {
+	params *CurveParams
+}
+
+var p384 p384Curve
+var _ Curve = p384
+
+func initP384() {
+	p384.params = &CurveParams{
+		Name:    "P-384",
+		BitSize: 384,
+		// FIPS 186-4, section D.1.2.4
+		P: bigFromDecimal("394020061963944792122790401001436138050797392704654" +
+			"46667948293404245721771496870329047266088258938001861606973112319"),
+		N: bigFromDecimal("394020061963944792122790401001436138050797392704654" +
+			"46667946905279627659399113263569398956308152294913554433653942643"),
+		B: bigFromHex("b3312fa7e23ee7e4988e056be3f82d19181d9c6efe8141120314088" +
+			"f5013875ac656398d8a2ed19d2a85c8edd3ec2aef"),
+		Gx: bigFromHex("aa87ca22be8b05378eb1c71ef320ad746e1d3b628ba79b9859f741" +
+			"e082542a385502f25dbf55296c3a545e3872760ab7"),
+		Gy: bigFromHex("3617de4a96262c6f5d9e98bf9292dc29f8f41dbd289a147ce9da31" +
+			"13b5f0b8c00a60b1ce1d7e819d7a431d7c90ea0e5f"),
+	}
+}
+
+func (curve p384Curve) Params() *CurveParams {
+	return curve.params
+}
+
+func (curve p384Curve) IsOnCurve(x, y *big.Int) bool {
+	// IsOnCurve is documented to reject (0, 0), the conventional point at
+	// infinity, which however is accepted by p384PointFromAffine.
+	if x.Sign() == 0 && y.Sign() == 0 {
+		return false
+	}
+	_, ok := p384PointFromAffine(x, y)
+	return ok
+}
+
+func p384PointFromAffine(x, y *big.Int) (p *nistec.P384Point, ok bool) {
+	// (0, 0) is by convention the point at infinity, which can't be represented
+	// in affine coordinates. Marshal incorrectly encodes it as an uncompressed
+	// point, which SetBytes would correctly reject. See Issue 37294.
+	if x.Sign() == 0 && y.Sign() == 0 {
+		return nistec.NewP384Point(), true
+	}
+	if x.BitLen() > 384 || y.BitLen() > 384 {
+		return nil, false
+	}
+	p, err := nistec.NewP384Point().SetBytes(Marshal(P384(), x, y))
+	if err != nil {
+		return nil, false
+	}
+	return p, true
+}
+
+func p384PointToAffine(p *nistec.P384Point) (x, y *big.Int) {
+	out := p.Bytes()
+	if len(out) == 1 && out[0] == 0 {
+		// This is the correct encoding of the point at infinity, which
+		// Unmarshal does not support. See Issue 37294.
+		return new(big.Int), new(big.Int)
+	}
+	x, y = Unmarshal(P384(), out)
+	if x == nil {
+		panic("crypto/elliptic: internal error: Unmarshal rejected a valid point encoding")
+	}
+	return x, y
+}
+
+// p384RandomPoint returns a random point on the curve. It's used when Add,
+// Double, or ScalarMult are fed a point not on the curve, which is undefined
+// behavior. Originally, we used to do the math on it anyway (which allows
+// invalid curve attacks) and relied on the caller and Unmarshal to avoid this
+// happening in the first place. Now, we just can't construct a nistec.P384Point
+// for an invalid pair of coordinates, because that API is safer. If we panic,
+// we risk introducing a DoS. If we return nil, we risk a panic. If we return
+// the input, ecdsa.Verify might fail open. The safest course seems to be to
+// return a valid, random point, which hopefully won't help the attacker.
+func p384RandomPoint() (x, y *big.Int) {
+	_, x, y, err := GenerateKey(P384(), rand.Reader)
+	if err != nil {
+		panic("crypto/elliptic: failed to generate random point")
+	}
+	return x, y
+}
+
+func (p384Curve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
+	p1, ok := p384PointFromAffine(x1, y1)
+	if !ok {
+		return p384RandomPoint()
+	}
+	p2, ok := p384PointFromAffine(x2, y2)
+	if !ok {
+		return p384RandomPoint()
+	}
+	return p384PointToAffine(p1.Add(p1, p2))
+}
+
+func (p384Curve) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
+	p, ok := p384PointFromAffine(x1, y1)
+	if !ok {
+		return p384RandomPoint()
+	}
+	return p384PointToAffine(p.Double(p))
+}
+
+func (p384Curve) ScalarMult(Bx, By *big.Int, scalar []byte) (*big.Int, *big.Int) {
+	p, ok := p384PointFromAffine(Bx, By)
+	if !ok {
+		return p384RandomPoint()
+	}
+	return p384PointToAffine(p.ScalarMult(p, scalar))
+}
+
+func (p384Curve) ScalarBaseMult(scalar []byte) (*big.Int, *big.Int) {
+	p := nistec.NewP384Generator()
+	return p384PointToAffine(p.ScalarMult(p, scalar))
+}
--- a/src/crypto/elliptic/p521.go
+++ b/src/crypto/elliptic/p521.go
@ -112,7 +112,7 @@ func p521RandomPoint() (x, y *big.Int) {
 	return x, y
 }

-func (curve p521Curve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
+func (p521Curve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
 	p1, ok := p521PointFromAffine(x1, y1)
 	if !ok {
 		return p521RandomPoint()
@ -124,7 +124,7 @@ func (curve p521Curve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
 	return p521PointToAffine(p1.Add(p1, p2))
 }

-func (curve p521Curve) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
+func (p521Curve) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
 	p, ok := p521PointFromAffine(x1, y1)
 	if !ok {
 		return p521RandomPoint()
@ -132,7 +132,7 @@ func (curve p521Curve) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
 	return p521PointToAffine(p.Double(p))
 }

-func (curve p521Curve) ScalarMult(Bx, By *big.Int, scalar []byte) (*big.Int, *big.Int) {
+func (p521Curve) ScalarMult(Bx, By *big.Int, scalar []byte) (*big.Int, *big.Int) {
 	p, ok := p521PointFromAffine(Bx, By)
 	if !ok {
 		return p521RandomPoint()
@ -140,7 +140,7 @@ func (curve p521Curve) ScalarMult(Bx, By *big.Int, scalar []byte) (*big.Int, *bi
 	return p521PointToAffine(p.ScalarMult(p, scalar))
 }

-func (curve p521Curve) ScalarBaseMult(scalar []byte) (*big.Int, *big.Int) {
+func (p521Curve) ScalarBaseMult(scalar []byte) (*big.Int, *big.Int) {
 	p := nistec.NewP521Generator()
 	return p521PointToAffine(p.ScalarMult(p, scalar))
 }
--- a/src/crypto/tls/common.go
+++ b/src/crypto/tls/common.go
@ -18,6 +18,7 @@ import (
 	"crypto/x509"
 	"errors"
 	"fmt"
+	"internal/godebug"
 	"io"
 	"net"
 	"strings"
@ -682,11 +683,20 @@ type Config struct {
 	ClientSessionCache ClientSessionCache

 	// MinVersion contains the minimum TLS version that is acceptable.
-	// If zero, TLS 1.0 is currently taken as the minimum.
+	//
+	// By default, TLS 1.2 is currently used as the minimum when acting as a
+	// client, and TLS 1.0 when acting as a server. TLS 1.0 is the minimum
+	// supported by this package, both as a client and as a server.
+	//
+	// The client-side default can temporarily be reverted to TLS 1.0 by
+	// including the value "tls10default=1" in the GODEBUG environment variable.
+	// Note that this option will be removed in Go 1.19 (but it will still be
+	// possible to set this field to VersionTLS10 explicitly).
 	MinVersion uint16

 	// MaxVersion contains the maximum TLS version that is acceptable.
-	// If zero, the maximum version supported by this package is used,
+	//
+	// By default, the maximum version supported by this package is used,
 	// which is currently TLS 1.3.
 	MaxVersion uint16

@ -967,12 +977,24 @@ var supportedVersions = []uint16{
 	VersionTLS10,
 }

-func (c *Config) supportedVersions() []uint16 {
+// debugEnableTLS10 enables TLS 1.0. See issue 45428.
+var debugEnableTLS10 = godebug.Get("tls10default") == "1"
+
+// roleClient and roleServer are passed to supportedVersions and its callers
+// so that the role is readable at the call site.
+const roleClient = true
+const roleServer = false
+
+func (c *Config) supportedVersions(isClient bool) []uint16 {
 	versions := make([]uint16, 0, len(supportedVersions))
 	for _, v := range supportedVersions {
 		if needFIPS() && (v < fipsMinVersion(c) || v > fipsMaxVersion(c)) {
 			continue
 		}
+		if (c == nil || c.MinVersion == 0) && !debugEnableTLS10 &&
+			isClient && v < VersionTLS12 {
+			continue
+		}
 		if c != nil && c.MinVersion != 0 && v < c.MinVersion {
 			continue
 		}
@ -984,8 +1006,8 @@ func (c *Config) supportedVersions() []uint16 {
 	return versions
 }

-func (c *Config) maxSupportedVersion() uint16 {
-	supportedVersions := c.supportedVersions()
+func (c *Config) maxSupportedVersion(isClient bool) uint16 {
+	supportedVersions := c.supportedVersions(isClient)
 	if len(supportedVersions) == 0 {
 		return 0
 	}
@ -1029,8 +1051,8 @@ func (c *Config) supportsCurve(curve CurveID) bool {

 // mutualVersion returns the protocol version to use given the advertised
 // versions of the peer. Priority is given to the peer preference order.
-func (c *Config) mutualVersion(peerVersions []uint16) (uint16, bool) {
-	supportedVersions := c.supportedVersions()
+func (c *Config) mutualVersion(isClient bool, peerVersions []uint16) (uint16, bool) {
+	supportedVersions := c.supportedVersions(isClient)
 	for _, peerVersion := range peerVersions {
 		for _, v := range supportedVersions {
 			if v == peerVersion {
@ -1109,7 +1131,7 @@ func (chi *ClientHelloInfo) SupportsCertificate(c *Certificate) error {
 	if config == nil {
 		config = &Config{}
 	}
-	vers, ok := config.mutualVersion(chi.SupportedVersions)
+	vers, ok := config.mutualVersion(roleServer, chi.SupportedVersions)
 	if !ok {
 		return errors.New("no mutually supported protocol versions")
 	}
--- a/src/crypto/tls/handshake_client.go
+++ b/src/crypto/tls/handshake_client.go
@ -52,12 +52,12 @@ func (c *Conn) makeClientHello() (*clientHelloMsg, ecdheParameters, error) {
 		return nil, nil, errors.New("tls: NextProtos values too large")
 	}

-	supportedVersions := config.supportedVersions()
+	supportedVersions := config.supportedVersions(roleClient)
 	if len(supportedVersions) == 0 {
 		return nil, nil, errors.New("tls: no supported versions satisfy MinVersion and MaxVersion")
 	}

-	clientHelloVersion := config.maxSupportedVersion()
+	clientHelloVersion := config.maxSupportedVersion(roleClient)
 	// The version at the beginning of the ClientHello was capped at TLS 1.2
 	// for compatibility reasons. The supported_versions extension is used
 	// to negotiate versions now. See RFC 8446, Section 4.2.1.
@ -197,7 +197,7 @@ func (c *Conn) clientHandshake(ctx context.Context) (err error) {
 	// If we are negotiating a protocol version that's lower than what we
 	// support, check for the server downgrade canaries.
 	// See RFC 8446, Section 4.1.3.
-	maxVers := c.config.maxSupportedVersion()
+	maxVers := c.config.maxSupportedVersion(roleClient)
 	tls12Downgrade := string(serverHello.random[24:]) == downgradeCanaryTLS12
 	tls11Downgrade := string(serverHello.random[24:]) == downgradeCanaryTLS11
 	if maxVers == VersionTLS13 && c.vers <= VersionTLS12 && (tls12Downgrade || tls11Downgrade) ||
@ -365,7 +365,7 @@ func (c *Conn) pickTLSVersion(serverHello *serverHelloMsg) error {
 		peerVersion = serverHello.supportedVersion
 	}

-	vers, ok := c.config.mutualVersion([]uint16{peerVersion})
+	vers, ok := c.config.mutualVersion(roleClient, []uint16{peerVersion})
 	if !ok {
 		c.sendAlert(alertProtocolVersion)
 		return fmt.Errorf("tls: server selected unsupported protocol version %x", peerVersion)
--- a/src/crypto/tls/handshake_server.go
+++ b/src/crypto/tls/handshake_server.go
@ -156,7 +156,7 @@ func (c *Conn) readClientHello(ctx context.Context) (*clientHelloMsg, error) {
 	if len(clientHello.supportedVersions) == 0 {
 		clientVersions = supportedVersionsFromMax(clientHello.vers)
 	}
-	c.vers, ok = c.config.mutualVersion(clientVersions)
+	c.vers, ok = c.config.mutualVersion(roleServer, clientVersions)
 	if !ok {
 		c.sendAlert(alertProtocolVersion)
 		return nil, fmt.Errorf("tls: client offered only unsupported versions: %x", clientVersions)
@ -191,7 +191,7 @@ func (hs *serverHandshakeState) processClientHello() error {
 	hs.hello.random = make([]byte, 32)
 	serverRandom := hs.hello.random
 	// Downgrade protection canaries. See RFC 8446, Section 4.1.3.
-	maxVers := c.config.maxSupportedVersion()
+	maxVers := c.config.maxSupportedVersion(roleServer)
 	if maxVers >= VersionTLS12 && c.vers < maxVers || testingOnlyForceDowngradeCanary {
 		if c.vers == VersionTLS12 {
 			copy(serverRandom[24:], downgradeCanaryTLS12)
@ -354,7 +354,7 @@ func (hs *serverHandshakeState) pickCipherSuite() error {
 	for _, id := range hs.clientHello.cipherSuites {
 		if id == TLS_FALLBACK_SCSV {
 			// The client is doing a fallback connection. See RFC 7507.
-			if hs.clientHello.vers < c.config.maxSupportedVersion() {
+			if hs.clientHello.vers < c.config.maxSupportedVersion(roleServer) {
 				c.sendAlert(alertInappropriateFallback)
 				return errors.New("tls: client using inappropriate protocol fallback")
 			}
--- a/src/crypto/tls/handshake_server_test.go
+++ b/src/crypto/tls/handshake_server_test.go
@ -385,13 +385,30 @@ func TestVersion(t *testing.T) {
 	}
 	clientConfig := &Config{
 		InsecureSkipVerify: true,
+		MinVersion:         VersionTLS10,
 	}
 	state, _, err := testHandshake(t, clientConfig, serverConfig)
 	if err != nil {
 		t.Fatalf("handshake failed: %s", err)
 	}
 	if state.Version != VersionTLS11 {
-		t.Fatalf("Incorrect version %x, should be %x", state.Version, VersionTLS11)
+		t.Fatalf("incorrect version %x, should be %x", state.Version, VersionTLS11)
+	}
+
+	clientConfig.MinVersion = 0
+	_, _, err = testHandshake(t, clientConfig, serverConfig)
+	if err == nil {
+		t.Fatalf("expected failure to connect with TLS 1.0/1.1")
+	}
+
+	defer func(old bool) { debugEnableTLS10 = old }(debugEnableTLS10)
+	debugEnableTLS10 = true
+	state, _, err = testHandshake(t, clientConfig, serverConfig) // refresh state for the version check below
+	if err != nil {
+		t.Fatalf("handshake failed: %s", err)
+	}
+	if state.Version != VersionTLS11 {
+		t.Fatalf("incorrect version %x, should be %x", state.Version, VersionTLS11)
 	}
 }

@ -472,6 +489,7 @@ func testCrossVersionResume(t *testing.T, version uint16) {
 		InsecureSkipVerify: true,
 		ClientSessionCache: NewLRUClientSessionCache(1),
 		ServerName:         "servername",
+		MinVersion:         VersionTLS10,
 	}

 	// Establish a session at TLS 1.1.
--- a/src/crypto/tls/handshake_server_tls13.go
+++ b/src/crypto/tls/handshake_server_tls13.go
@ -114,7 +114,7 @@ func (hs *serverHandshakeStateTLS13) processClientHello() error {
 		if id == TLS_FALLBACK_SCSV {
 			// Use c.vers instead of max(supported_versions) because an attacker
 			// could defeat this by adding an arbitrary high version otherwise.
-			if c.vers < c.config.maxSupportedVersion() {
+			if c.vers < c.config.maxSupportedVersion(roleServer) {
 				c.sendAlert(alertInappropriateFallback)
 				return errors.New("tls: client using inappropriate protocol fallback")
 			}
--- a/src/crypto/tls/handshake_test.go
+++ b/src/crypto/tls/handshake_test.go
@ -363,6 +363,8 @@ func runMain(m *testing.M) int {
 		Certificates:       make([]Certificate, 2),
 		InsecureSkipVerify: true,
 		CipherSuites:       allCipherSuites(),
+		MinVersion:         VersionTLS10,
+		MaxVersion:         VersionTLS13,
 	}
 	testConfig.Certificates[0].Certificate = [][]byte{testRSACertificate}
 	testConfig.Certificates[0].PrivateKey = testRSAPrivateKey
--- a/src/crypto/x509/verify_test.go
+++ b/src/crypto/x509/verify_test.go
@ -534,6 +534,10 @@ func testVerify(t *testing.T, test verifyTest, useSystemRoots bool) {
 }

 func TestGoVerify(t *testing.T) {
+	// Temporarily enable SHA-1 verification since a number of test chains
+	// require it. TODO(filippo): regenerate test chains.
+	defer func(old bool) { debugAllowSHA1 = old }(debugAllowSHA1)
+	debugAllowSHA1 = true
 	for _, test := range verifyTests {
 		t.Run(test.name, func(t *testing.T) {
 			testVerify(t, test, false)
--- a/src/crypto/x509/x509.go
+++ b/src/crypto/x509/x509.go
@ -18,6 +18,7 @@ import (
 	"encoding/pem"
 	"errors"
 	"fmt"
+	"internal/godebug"
 	"io"
 	"math/big"
 	"net"
@ -181,15 +182,15 @@ type SignatureAlgorithm int
 const (
 	UnknownSignatureAlgorithm SignatureAlgorithm = iota

-	MD2WithRSA // Unsupported.
-	MD5WithRSA // Only supported for signing, not verification.
-	SHA1WithRSA
+	MD2WithRSA  // Unsupported.
+	MD5WithRSA  // Only supported for signing, not verification.
+	SHA1WithRSA // Only supported for signing, not verification.
 	SHA256WithRSA
 	SHA384WithRSA
 	SHA512WithRSA
 	DSAWithSHA1   // Unsupported.
 	DSAWithSHA256 // Unsupported.
-	ECDSAWithSHA1
+	ECDSAWithSHA1 // Only supported for signing, not verification.
 	ECDSAWithSHA256
 	ECDSAWithSHA384
 	ECDSAWithSHA512
@ -729,11 +730,23 @@ type Certificate struct {
 // involves algorithms that are not currently implemented.
 var ErrUnsupportedAlgorithm = errors.New("x509: cannot verify signature: algorithm unimplemented")

-// An InsecureAlgorithmError
+// debugAllowSHA1 allows SHA-1 signatures. See issue 41682.
+var debugAllowSHA1 = godebug.Get("x509sha1") == "1"
+
+// An InsecureAlgorithmError indicates that the SignatureAlgorithm used to
+// generate the signature is not secure, and the signature has been rejected.
+//
+// To temporarily restore support for SHA-1 signatures, include the value
+// "x509sha1=1" in the GODEBUG environment variable. Note that this option will
+// be removed in Go 1.19.
 type InsecureAlgorithmError SignatureAlgorithm

 func (e InsecureAlgorithmError) Error() string {
-	return fmt.Sprintf("x509: cannot verify signature: insecure algorithm %v", SignatureAlgorithm(e))
+	var override string
+	if SignatureAlgorithm(e) == SHA1WithRSA || SignatureAlgorithm(e) == ECDSAWithSHA1 {
+		override = " (temporarily override with GODEBUG=x509sha1=1)"
+	}
+	return fmt.Sprintf("x509: cannot verify signature: insecure algorithm %v", SignatureAlgorithm(e)) + override
 }

 // ConstraintViolationError results when a requested usage is not permitted by
@ -825,6 +838,11 @@ func checkSignature(algo SignatureAlgorithm, signed, signature []byte, publicKey
 		}
 	case crypto.MD5:
 		return InsecureAlgorithmError(algo)
+	case crypto.SHA1:
+		if !debugAllowSHA1 {
+			return InsecureAlgorithmError(algo)
+		}
+		fallthrough
 	default:
 		if !hashType.Available() {
 			return ErrUnsupportedAlgorithm
@ -1579,9 +1597,12 @@ func CreateCertificate(rand io.Reader, template, parent *Certificate, pub, priv
 	}

 	// Check the signature to ensure the crypto.Signer behaved correctly.
-	// We skip this check if the signature algorithm is MD5WithRSA as we
-	// only support this algorithm for signing, and not verification.
-	if sigAlg := getSignatureAlgorithmFromAI(signatureAlgorithm); sigAlg != MD5WithRSA {
+	sigAlg := getSignatureAlgorithmFromAI(signatureAlgorithm)
+	switch sigAlg {
+	case MD5WithRSA, SHA1WithRSA, ECDSAWithSHA1:
+		// We skip the check if the signature algorithm is only supported for
+		// signing, not verification.
+	default:
 		if err := checkSignature(sigAlg, c.Raw, signature, key.Public()); err != nil {
 			return nil, fmt.Errorf("x509: signature over certificate returned by signer is invalid: %w", err)
 		}
--- a/src/crypto/x509/x509_test.go
+++ b/src/crypto/x509/x509_test.go
@ -585,10 +585,10 @@ func TestCreateSelfSignedCertificate(t *testing.T) {
 		checkSig  bool
 		sigAlgo   SignatureAlgorithm
 	}{
-		{"RSA/RSA", &testPrivateKey.PublicKey, testPrivateKey, true, SHA1WithRSA},
+		{"RSA/RSA", &testPrivateKey.PublicKey, testPrivateKey, true, SHA384WithRSA},
 		{"RSA/ECDSA", &testPrivateKey.PublicKey, ecdsaPriv, false, ECDSAWithSHA384},
 		{"ECDSA/RSA", &ecdsaPriv.PublicKey, testPrivateKey, false, SHA256WithRSA},
-		{"ECDSA/ECDSA", &ecdsaPriv.PublicKey, ecdsaPriv, true, ECDSAWithSHA1},
+		{"ECDSA/ECDSA", &ecdsaPriv.PublicKey, ecdsaPriv, true, ECDSAWithSHA256},
 		{"RSAPSS/RSAPSS", &testPrivateKey.PublicKey, testPrivateKey, true, SHA256WithRSAPSS},
 		{"ECDSA/RSAPSS", &ecdsaPriv.PublicKey, testPrivateKey, false, SHA256WithRSAPSS},
 		{"RSAPSS/ECDSA", &testPrivateKey.PublicKey, ecdsaPriv, false, ECDSAWithSHA384},
@ -886,7 +886,6 @@ var ecdsaTests = []struct {
 	sigAlgo SignatureAlgorithm
 	pemCert string
 }{
-	{ECDSAWithSHA1, ecdsaSHA1CertPem},
 	{ECDSAWithSHA256, ecdsaSHA256p256CertPem},
 	{ECDSAWithSHA256, ecdsaSHA256p384CertPem},
 	{ECDSAWithSHA384, ecdsaSHA384p521CertPem},
@ -1389,10 +1388,10 @@ func TestCreateCertificateRequest(t *testing.T) {
 		priv    interface{}
 		sigAlgo SignatureAlgorithm
 	}{
-		{"RSA", testPrivateKey, SHA1WithRSA},
-		{"ECDSA-256", ecdsa256Priv, ECDSAWithSHA1},
-		{"ECDSA-384", ecdsa384Priv, ECDSAWithSHA1},
-		{"ECDSA-521", ecdsa521Priv, ECDSAWithSHA1},
+		{"RSA", testPrivateKey, SHA256WithRSA},
+		{"ECDSA-256", ecdsa256Priv, ECDSAWithSHA256},
+		{"ECDSA-384", ecdsa384Priv, ECDSAWithSHA256},
+		{"ECDSA-521", ecdsa521Priv, ECDSAWithSHA256},
 		{"Ed25519", ed25519Priv, PureEd25519},
 	}

@ -1783,6 +1782,9 @@ func TestInsecureAlgorithmErrorString(t *testing.T) {
 		sa   SignatureAlgorithm
 		want string
 	}{
+		{MD5WithRSA, "x509: cannot verify signature: insecure algorithm MD5-RSA"},
+		{SHA1WithRSA, "x509: cannot verify signature: insecure algorithm SHA1-RSA (temporarily override with GODEBUG=x509sha1=1)"},
+		{ECDSAWithSHA1, "x509: cannot verify signature: insecure algorithm ECDSA-SHA1 (temporarily override with GODEBUG=x509sha1=1)"},
 		{MD2WithRSA, "x509: cannot verify signature: insecure algorithm MD2-RSA"},
 		{-1, "x509: cannot verify signature: insecure algorithm -1"},
 		{0, "x509: cannot verify signature: insecure algorithm 0"},
@ -1846,6 +1848,30 @@ func TestMD5(t *testing.T) {
 	}
 }

+func TestSHA1(t *testing.T) {
+	pemBlock, _ := pem.Decode([]byte(ecdsaSHA1CertPem))
+	cert, err := ParseCertificate(pemBlock.Bytes)
+	if err != nil {
+		t.Fatalf("failed to parse certificate: %s", err)
+	}
+	if sa := cert.SignatureAlgorithm; sa != ECDSAWithSHA1 {
+		t.Errorf("signature algorithm is %v, want %v", sa, ECDSAWithSHA1)
+	}
+	if err = cert.CheckSignatureFrom(cert); err == nil { // cert is checked against itself, before debugAllowSHA1 is forced on
+		t.Fatalf("certificate verification succeeded incorrectly")
+	}
+	if _, ok := err.(InsecureAlgorithmError); !ok {
+		t.Fatalf("certificate verification returned %v (%T), wanted InsecureAlgorithmError", err, err)
+	}
+
+	defer func(old bool) { debugAllowSHA1 = old }(debugAllowSHA1) // put the previous value back when the test returns
+	debugAllowSHA1 = true
+
+	if err = cert.CheckSignatureFrom(cert); err != nil {
+		t.Fatalf("SHA-1 certificate did not verify with GODEBUG=x509sha1=1: %v", err)
+	}
+}
+
 // certMissingRSANULL contains an RSA public key where the AlgorithmIdentifier
 // parameters are omitted rather than being an ASN.1 NULL.
 const certMissingRSANULL = `
@ -2897,19 +2923,31 @@ func TestCreateCertificateBrokenSigner(t *testing.T) {
 	}
 }

-func TestCreateCertificateMD5(t *testing.T) {
-	template := &Certificate{
-		SerialNumber:       big.NewInt(10),
-		DNSNames:           []string{"example.com"},
-		SignatureAlgorithm: MD5WithRSA,
-	}
-	k, err := rsa.GenerateKey(rand.Reader, 1024)
+func TestCreateCertificateLegacy(t *testing.T) {
+	ecdsaPriv, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
 	if err != nil {
-		t.Fatalf("failed to generate test key: %s", err)
+		t.Fatalf("Failed to generate ECDSA key: %s", err)
 	}
-	_, err = CreateCertificate(rand.Reader, template, template, k.Public(), &brokenSigner{k.Public()})
-	if err != nil {
-		t.Fatalf("CreateCertificate failed when SignatureAlgorithm = MD5WithRSA: %s", err)
+
+	for _, sigAlg := range []SignatureAlgorithm{
+		MD5WithRSA, SHA1WithRSA, ECDSAWithSHA1,
+	} {
+		template := &Certificate{
+			SerialNumber:       big.NewInt(10),
+			DNSNames:           []string{"example.com"},
+			SignatureAlgorithm: sigAlg,
+		}
+		var k crypto.Signer
+		switch sigAlg {
+		case MD5WithRSA, SHA1WithRSA:
+			k = testPrivateKey
+		case ECDSAWithSHA1:
+			k = ecdsaPriv
+		}
+		_, err := CreateCertificate(rand.Reader, template, template, k.Public(), &brokenSigner{k.Public()})
+		if err != nil {
+			t.Fatalf("CreateCertificate failed when SignatureAlgorithm = %v: %s", sigAlg, err)
+		}
 	}
 }

@ -3131,7 +3169,6 @@ func TestParseCertificateRawEquals(t *testing.T) {
 	if !bytes.Equal(p.Bytes, cert.Raw) {
 		t.Fatalf("unexpected Certificate.Raw\ngot: %x\nwant: %x\n", cert.Raw, p.Bytes)
 	}
-	fmt.Printf("in:  %x\nout: %x\n", p.Bytes, cert.Raw)
 }

 // mismatchingSigAlgIDPEM contains a certificate where the Certificate
--- a/src/debug/elf/elf.go
+++ b/src/debug/elf/elf.go
@ -2349,6 +2349,7 @@ const (
 	R_PPC64_GOT16_HI           R_PPC64 = 16 // R_POWERPC_GOT16_HI
 	R_PPC64_GOT16_HA           R_PPC64 = 17 // R_POWERPC_GOT16_HA
 	R_PPC64_JMP_SLOT           R_PPC64 = 21 // R_POWERPC_JMP_SLOT
+	R_PPC64_RELATIVE           R_PPC64 = 22 // R_POWERPC_RELATIVE
 	R_PPC64_REL32              R_PPC64 = 26 // R_POWERPC_REL32
 	R_PPC64_ADDR64             R_PPC64 = 38
 	R_PPC64_ADDR16_HIGHER      R_PPC64 = 39
@ -2457,6 +2458,7 @@ var rppc64Strings = []intName{
 	{16, "R_PPC64_GOT16_HI"},
 	{17, "R_PPC64_GOT16_HA"},
 	{21, "R_PPC64_JMP_SLOT"},
+	{22, "R_PPC64_RELATIVE"},
 	{26, "R_PPC64_REL32"},
 	{38, "R_PPC64_ADDR64"},
 	{39, "R_PPC64_ADDR16_HIGHER"},
--- a/src/debug/plan9obj/file.go
+++ b/src/debug/plan9obj/file.go
@ -301,11 +301,15 @@ func newTable(symtab []byte, ptrsz int) ([]Sym, error) {
 	return syms, nil
 }

+// ErrNoSymbols is returned by File.Symbols if there is no such section
+// in the File.
+var ErrNoSymbols = errors.New("no symbol section")
+
 // Symbols returns the symbol table for f.
 func (f *File) Symbols() ([]Sym, error) {
 	symtabSection := f.Section("syms")
 	if symtabSection == nil {
-		return nil, errors.New("no symbol section")
+		return nil, ErrNoSymbols
 	}

 	symtab, err := symtabSection.Data()
--- a/src/errors/wrap_test.go
+++ b/src/errors/wrap_test.go
@ -265,3 +265,13 @@ func ExampleAs() {
 	// Output:
 	// Failed at path: non-existing
 }
+
+func ExampleUnwrap() {
+	err1 := errors.New("error1")
+	err2 := fmt.Errorf("error2: [%w]", err1) // %w makes err2 wrap err1
+	fmt.Println(err2)
+	fmt.Println(errors.Unwrap(err2)) // prints the wrapped err1
+	// Output:
+	// error2: [error1]
+	// error1
+}
--- a/src/internal/poll/fd_windows.go
+++ b/src/internal/poll/fd_windows.go
@ -593,7 +593,7 @@ func (fd *FD) ReadFrom(buf []byte) (int, syscall.Sockaddr, error) {
 	return n, sa, nil
 }

-// ReadFrom wraps the recvfrom network call for IPv4.
+// ReadFromInet4 wraps the recvfrom network call for IPv4.
 func (fd *FD) ReadFromInet4(buf []byte, sa4 *syscall.SockaddrInet4) (int, error) {
 	if len(buf) == 0 {
 		return 0, nil
@ -622,7 +622,7 @@ func (fd *FD) ReadFromInet4(buf []byte, sa4 *syscall.SockaddrInet4) (int, error)
 	return n, err
 }

-// ReadFrom wraps the recvfrom network call for IPv6.
+// ReadFromInet6 wraps the recvfrom network call for IPv6.
 func (fd *FD) ReadFromInet6(buf []byte, sa6 *syscall.SockaddrInet6) (int, error) {
 	if len(buf) == 0 {
 		return 0, nil
--- a/src/net/http/export_test.go
+++ b/src/net/http/export_test.go
@ -88,12 +88,7 @@ func SetPendingDialHooks(before, after func()) {

 func SetTestHookServerServe(fn func(*Server, net.Listener)) { testHookServerServe = fn }

-func NewTestTimeoutHandler(handler Handler, ch <-chan time.Time) Handler {
-	ctx, cancel := context.WithCancel(context.Background())
-	go func() {
-		<-ch
-		cancel()
-	}()
+func NewTestTimeoutHandler(handler Handler, ctx context.Context) Handler {
 	return &timeoutHandler{
 		handler:     handler,
 		testContext: ctx,
--- a/src/net/http/main_test.go
+++ b/src/net/http/main_test.go
@ -43,7 +43,7 @@ func interestingGoroutines() (gs []string) {
 			// These only show up with GOTRACEBACK=2; Issue 5005 (comment 28)
 			strings.Contains(stack, "runtime.goexit") ||
 			strings.Contains(stack, "created by runtime.gc") ||
-			strings.Contains(stack, "net/http_test.interestingGoroutines") ||
+			strings.Contains(stack, "interestingGoroutines") ||
 			strings.Contains(stack, "runtime.MHeap_Scavenger") {
 			continue
 		}
--- a/src/net/http/serve_test.go
+++ b/src/net/http/serve_test.go
@ -2274,6 +2274,18 @@ func TestRequestBodyTimeoutClosesConnection(t *testing.T) {
 	}
 }

+// cancelableTimeoutContext reports any error of the wrapped Context as DeadlineExceeded.
+type cancelableTimeoutContext struct {
+	context.Context
+}
+
+func (c cancelableTimeoutContext) Err() error {
+	if c.Context.Err() != nil {
+		return context.DeadlineExceeded // whatever the wrapped error is, callers see a deadline error
+	}
+	return nil
+}
+
 func TestTimeoutHandler_h1(t *testing.T) { testTimeoutHandler(t, h1Mode) }
 func TestTimeoutHandler_h2(t *testing.T) { testTimeoutHandler(t, h2Mode) }
 func testTimeoutHandler(t *testing.T, h2 bool) {
@ -2286,8 +2298,9 @@ func testTimeoutHandler(t *testing.T, h2 bool) {
 		_, werr := w.Write([]byte("hi"))
 		writeErrors <- werr
 	})
-	timeout := make(chan time.Time, 1) // write to this to force timeouts
-	cst := newClientServerTest(t, h2, NewTestTimeoutHandler(sayHi, timeout))
+	ctx, cancel := context.WithCancel(context.Background())
+	h := NewTestTimeoutHandler(sayHi, cancelableTimeoutContext{ctx})
+	cst := newClientServerTest(t, h2, h)
 	defer cst.close()

 	// Succeed without timing out:
@ -2308,7 +2321,8 @@ func testTimeoutHandler(t *testing.T, h2 bool) {
 	}

 	// Times out:
-	timeout <- time.Time{}
+	cancel()
+
 	res, err = cst.c.Get(cst.ts.URL)
 	if err != nil {
 		t.Error(err)
@ -2429,8 +2443,9 @@ func TestTimeoutHandlerRaceHeaderTimeout(t *testing.T) {
 		_, werr := w.Write([]byte("hi"))
 		writeErrors <- werr
 	})
-	timeout := make(chan time.Time, 1) // write to this to force timeouts
-	cst := newClientServerTest(t, h1Mode, NewTestTimeoutHandler(sayHi, timeout))
+	ctx, cancel := context.WithCancel(context.Background())
+	h := NewTestTimeoutHandler(sayHi, cancelableTimeoutContext{ctx})
+	cst := newClientServerTest(t, h1Mode, h)
 	defer cst.close()

 	// Succeed without timing out:
@ -2451,7 +2466,8 @@ func TestTimeoutHandlerRaceHeaderTimeout(t *testing.T) {
 	}

 	// Times out:
-	timeout <- time.Time{}
+	cancel()
+
 	res, err = cst.c.Get(cst.ts.URL)
 	if err != nil {
 		t.Error(err)
@ -2501,6 +2517,41 @@ func TestTimeoutHandlerStartTimerWhenServing(t *testing.T) {
 	}
 }

+func TestTimeoutHandlerContextCanceled(t *testing.T) {
+	setParallel(t)
+	defer afterTest(t)
+	sendHi := make(chan bool, 1)
+	writeErrors := make(chan error, 1)
+	sayHi := HandlerFunc(func(w ResponseWriter, r *Request) {
+		w.Header().Set("Content-Type", "text/plain")
+		<-sendHi
+		_, werr := w.Write([]byte("hi"))
+		writeErrors <- werr
+	})
+	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Hour)
+	h := NewTestTimeoutHandler(sayHi, ctx)
+	cancel()
+	cst := newClientServerTest(t, h1Mode, h)
+	defer cst.close()
+
+	// The context is already canceled, so the handler must answer 503 with no body:
+	res, err := cst.c.Get(cst.ts.URL)
+	if err != nil {
+		t.Fatal(err) // res is nil on error; continuing would dereference it
+	}
+	if g, e := res.StatusCode, StatusServiceUnavailable; g != e {
+		t.Errorf("got res.StatusCode %d; expected %d", g, e)
+	}
+	body, _ := io.ReadAll(res.Body)
+	if g, e := string(body), ""; g != e {
+		t.Errorf("got body %q; expected %q", g, e)
+	}
+	sendHi <- true // unblock the handler only now, so its Write cannot race the cancellation
+	if g, e := <-writeErrors, context.Canceled; g != e {
+		t.Errorf("got Write error %v; expected %v", g, e)
+	}
+}
+
 // https://golang.org/issue/15948
 func TestTimeoutHandlerEmptyResponse(t *testing.T) {
 	setParallel(t)
--- a/src/net/http/server.go
+++ b/src/net/http/server.go
@ -3391,9 +3391,15 @@ func (h *timeoutHandler) ServeHTTP(w ResponseWriter, r *Request) {
 	case <-ctx.Done():
 		tw.mu.Lock()
 		defer tw.mu.Unlock()
-		w.WriteHeader(StatusServiceUnavailable)
-		io.WriteString(w, h.errorBody())
-		tw.timedOut = true
+		switch err := ctx.Err(); err {
+		case context.DeadlineExceeded:
+			w.WriteHeader(StatusServiceUnavailable)
+			io.WriteString(w, h.errorBody())
+			tw.err = ErrHandlerTimeout
+		default:
+			w.WriteHeader(StatusServiceUnavailable)
+			tw.err = err
+		}
 	}
 }

@ -3404,7 +3410,7 @@ type timeoutWriter struct {
 	req  *Request

 	mu          sync.Mutex
-	timedOut    bool
+	err         error
 	wroteHeader bool
 	code        int
 }
@ -3424,8 +3430,8 @@ func (tw *timeoutWriter) Header() Header { return tw.h }
 func (tw *timeoutWriter) Write(p []byte) (int, error) {
 	tw.mu.Lock()
 	defer tw.mu.Unlock()
-	if tw.timedOut {
-		return 0, ErrHandlerTimeout
+	if tw.err != nil {
+		return 0, tw.err
 	}
 	if !tw.wroteHeader {
 		tw.writeHeaderLocked(StatusOK)
@ -3437,7 +3443,7 @@ func (tw *timeoutWriter) writeHeaderLocked(code int) {
 	checkWriteHeaderCode(code)

 	switch {
-	case tw.timedOut:
+	case tw.err != nil:
 		return
 	case tw.wroteHeader:
 		if tw.req != nil {
--- a/src/net/http/transport.go
+++ b/src/net/http/transport.go
@ -2481,7 +2481,7 @@ type requestAndChan struct {
 	callerGone <-chan struct{} // closed when roundTrip caller has returned
 }

-// A writeRequest is sent by the readLoop's goroutine to the
+// A writeRequest is sent by the caller's goroutine to the
 // writeLoop's goroutine to write a request while the read loop
 // concurrently waits on both the write response and the server's
 // reply.
--- a/src/net/netip/netip.go
+++ b/src/net/netip/netip.go
@ -155,9 +155,14 @@ func (err parseAddrError) Error() string {
 func parseIPv4(s string) (ip Addr, err error) {
 	var fields [4]uint8
 	var val, pos int
+	var digLen int // number of digits in current octet
 	for i := 0; i < len(s); i++ {
 		if s[i] >= '0' && s[i] <= '9' {
+			if digLen == 1 && val == 0 {
+				return Addr{}, parseAddrError{in: s, msg: "IPv4 field has octet with leading zero"}
+			}
 			val = val*10 + int(s[i]) - '0'
+			digLen++
 			if val > 255 {
 				return Addr{}, parseAddrError{in: s, msg: "IPv4 field has value >255"}
 			}
@ -175,6 +180,7 @@ func parseIPv4(s string) (ip Addr, err error) {
 			fields[pos] = uint8(val)
 			pos++
 			val = 0
+			digLen = 0
 		} else {
 			return Addr{}, parseAddrError{in: s, msg: "unexpected character", at: s[i:]}
 		}
@ -692,21 +698,19 @@ const (
 // IPv6 addresses with zones are returned without their zone (use the
 // Zone method to get it).
 // The ip zero value returns all zeroes.
-func (ip Addr) As16() [16]byte {
-	var ret [16]byte
-	bePutUint64(ret[:8], ip.addr.hi)
-	bePutUint64(ret[8:], ip.addr.lo)
-	return ret
+func (ip Addr) As16() (a16 [16]byte) {
+	bePutUint64(a16[:8], ip.addr.hi)
+	bePutUint64(a16[8:], ip.addr.lo)
+	return a16
 }

 // As4 returns an IPv4 or IPv4-in-IPv6 address in its 4-byte representation.
 // If ip is the zero Addr or an IPv6 address, As4 panics.
 // Note that 0.0.0.0 is not the zero Addr.
-func (ip Addr) As4() [4]byte {
+func (ip Addr) As4() (a4 [4]byte) {
 	if ip.z == z4 || ip.Is4In6() {
-		var ret [4]byte
-		bePutUint32(ret[:], uint32(ip.addr.lo))
-		return ret
+		bePutUint32(a4[:], uint32(ip.addr.lo))
+		return a4
 	}
 	if ip.z == z0 {
 		panic("As4 called on IP zero value")
--- a/src/net/netip/netip_test.go
+++ b/src/net/netip/netip_test.go
@ -29,9 +29,10 @@ var (

 func TestParseAddr(t *testing.T) {
 	var validIPs = []struct {
-		in  string
-		ip  Addr   // output of ParseAddr()
-		str string // output of String(). If "", use in.
+		in      string
+		ip      Addr   // output of ParseAddr()
+		str     string // output of String(). If "", use in.
+		wantErr string
 	}{
 		// Basic zero IPv4 address.
 		{
@ -45,15 +46,18 @@ func TestParseAddr(t *testing.T) {
 		},
 		// IPv4 address in windows-style "print all the digits" form.
 		{
-			in:  "010.000.015.001",
-			ip:  MkAddr(Mk128(0, 0xffff0a000f01), Z4),
-			str: "10.0.15.1",
+			in:      "010.000.015.001",
+			wantErr: `ParseAddr("010.000.015.001"): IPv4 field has octet with leading zero`,
 		},
 		// IPv4 address with a silly amount of leading zeros.
 		{
-			in:  "000001.00000002.00000003.000000004",
-			ip:  MkAddr(Mk128(0, 0xffff01020304), Z4),
-			str: "1.2.3.4",
+			in:      "000001.00000002.00000003.000000004",
+			wantErr: `ParseAddr("000001.00000002.00000003.000000004"): IPv4 field has octet with leading zero`,
+		},
+		// 4-in-6 with octet with leading zero
+		{
+			in:      "::ffff:1.2.03.4",
+			wantErr: `ParseAddr("::ffff:1.2.03.4"): ParseAddr("1.2.03.4"): IPv4 field has octet with leading zero (at "1.2.03.4")`,
 		},
 		// Basic zero IPv6 address.
 		{
@ -121,10 +125,16 @@ func TestParseAddr(t *testing.T) {
 		t.Run(test.in, func(t *testing.T) {
 			got, err := ParseAddr(test.in)
 			if err != nil {
+				if err.Error() == test.wantErr {
+					return
+				}
 				t.Fatal(err)
 			}
+			if test.wantErr != "" {
+				t.Fatalf("wanted error %q; got none", test.wantErr)
+			}
 			if got != test.ip {
-				t.Errorf("ParseAddr(%q) got %#v, want %#v", test.in, got, test.ip)
+				t.Errorf("got %#v, want %#v", got, test.ip)
 			}

 			// Check that ParseAddr is a pure function.
@ -963,7 +973,7 @@ func TestIs4In6(t *testing.T) {
 		{mustIP("::ffff:192.0.2.128"), true, mustIP("192.0.2.128")},
 		{mustIP("::ffff:192.0.2.128%eth0"), true, mustIP("192.0.2.128")},
 		{mustIP("::fffe:c000:0280"), false, mustIP("::fffe:c000:0280")},
-		{mustIP("::ffff:127.001.002.003"), true, mustIP("127.1.2.3")},
+		{mustIP("::ffff:127.1.2.3"), true, mustIP("127.1.2.3")},
 		{mustIP("::ffff:7f01:0203"), true, mustIP("127.1.2.3")},
 		{mustIP("0:0:0:0:0000:ffff:127.1.2.3"), true, mustIP("127.1.2.3")},
 		{mustIP("0:0:0:0:000000:ffff:127.1.2.3"), true, mustIP("127.1.2.3")},
@ -1796,3 +1806,12 @@ func TestInvalidAddrPortString(t *testing.T) {
 		}
 	}
 }
+
+var sink16 [16]byte // package-level destination for the values BenchmarkAs16 produces
+
+func BenchmarkAs16(b *testing.B) {
+	addr := MustParseAddr("1::10")
+	for i := 0; i < b.N; i++ {
+		sink16 = addr.As16()
+	}
+}
--- a/src/net/udpsock_test.go
+++ b/src/net/udpsock_test.go
@ -475,6 +475,17 @@ func TestUDPReadTimeout(t *testing.T) {
 }

 func TestAllocs(t *testing.T) {
+	switch runtime.GOOS {
+	case "plan9":
+		// Plan9 wasn't optimized.
+		t.Skipf("skipping on %v", runtime.GOOS)
+	}
+	builder := os.Getenv("GO_BUILDER_NAME")
+	switch builder {
+	case "linux-amd64-noopt":
+		// Optimizations are required to remove the allocs.
+		t.Skipf("skipping on %v", builder)
+	}
 	conn, err := ListenUDP("udp4", &UDPAddr{IP: IPv4(127, 0, 0, 1)})
 	if err != nil {
 		t.Fatal(err)
--- a/src/runtime/chan_test.go
+++ b/src/runtime/chan_test.go
@ -624,6 +624,10 @@ func TestShrinkStackDuringBlockedSend(t *testing.T) {
 }

 func TestNoShrinkStackWhileParking(t *testing.T) {
+	if runtime.GOOS == "netbsd" && runtime.GOARCH == "arm64" {
+		testenv.SkipFlaky(t, 49382)
+	}
+
 	// The goal of this test is to trigger a "racy sudog adjustment"
 	// throw. Basically, there's a window between when a goroutine
 	// becomes available for preemption for stack scanning (and thus,
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@ -796,21 +796,17 @@ func (p *PageAlloc) Free(base, npages uintptr) {
 		// None of the tests need any higher-level locking, so we just
 		// take the lock internally.
 		lock(pp.mheapLock)
-		pp.free(base, npages)
+		pp.free(base, npages, true)
 		unlock(pp.mheapLock)
 	})
 }
 func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) {
 	return ChunkIdx((*pageAlloc)(p).start), ChunkIdx((*pageAlloc)(p).end)
 }
-func (p *PageAlloc) Scavenge(nbytes uintptr, mayUnlock bool) (r uintptr) {
+func (p *PageAlloc) Scavenge(nbytes uintptr) (r uintptr) {
 	pp := (*pageAlloc)(p)
 	systemstack(func() {
-		// None of the tests need any higher-level locking, so we just
-		// take the lock internally.
-		lock(pp.mheapLock)
-		r = pp.scavenge(nbytes, mayUnlock)
-		unlock(pp.mheapLock)
+		r = pp.scavenge(nbytes)
 	})
 	return
 }
--- a/src/runtime/extern.go
+++ b/src/runtime/extern.go
@ -78,6 +78,11 @@ It is a comma-separated list of name=val pairs setting these named variables:
 	If the line ends with "(forced)", this GC was forced by a
 	runtime.GC() call.

+	harddecommit: setting harddecommit=1 causes memory that is returned to the OS to
+	also have protections removed on it. This is the only mode of operation on Windows,
+	but is helpful in debugging scavenger-related issues on other platforms. Currently,
+	this setting is only supported on Linux.
+
 	inittrace: setting inittrace=1 causes the runtime to emit a single line to standard
 	error for each package with init work, summarizing the execution time and memory
 	allocation. No information is printed for inits executed as part of plugin loading
--- a/src/runtime/lockrank.go
+++ b/src/runtime/lockrank.go
@ -51,9 +51,9 @@ const (
 	lockRankItab
 	lockRankReflectOffs
 	lockRankHchan // Multiple hchans acquired in lock order in syncadjustsudogs()
+	lockRankTraceBuf
 	lockRankFin
 	lockRankNotifyList
-	lockRankTraceBuf
 	lockRankTraceStrings
 	lockRankMspanSpecial
 	lockRankProf
@ -80,6 +80,7 @@ const (

 	// Memory-related leaf locks
 	lockRankGlobalAlloc
+	lockRankPageAllocScav

 	// Other leaf locks
 	lockRankGFree
@ -131,9 +132,9 @@ var lockNames = []string{
 	lockRankReflectOffs: "reflectOffs",

 	lockRankHchan:         "hchan",
+	lockRankTraceBuf:      "traceBuf",
 	lockRankFin:           "fin",
 	lockRankNotifyList:    "notifyList",
-	lockRankTraceBuf:      "traceBuf",
 	lockRankTraceStrings:  "traceStrings",
 	lockRankMspanSpecial:  "mspanSpecial",
 	lockRankProf:          "prof",
@ -157,7 +158,8 @@ var lockNames = []string{
 	lockRankMheap:        "mheap",
 	lockRankMheapSpecial: "mheapSpecial",

-	lockRankGlobalAlloc: "globalAlloc.mutex",
+	lockRankGlobalAlloc:   "globalAlloc.mutex",
+	lockRankPageAllocScav: "pageAlloc.scav.lock",

 	lockRankGFree:     "gFree",
 	lockRankHchanLeaf: "hchanLeaf",
@ -208,31 +210,32 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
 	lockRankItab:          {},
 	lockRankReflectOffs:   {lockRankItab},
 	lockRankHchan:         {lockRankScavenge, lockRankSweep, lockRankHchan},
-	lockRankFin:           {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan},
-	lockRankNotifyList:    {},
 	lockRankTraceBuf:      {lockRankSysmon, lockRankScavenge},
+	lockRankFin:           {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllg, lockRankTimers, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf},
+	lockRankNotifyList:    {},
 	lockRankTraceStrings:  {lockRankTraceBuf},
-	lockRankMspanSpecial:  {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
-	lockRankProf:          {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
-	lockRankGcBitsArenas:  {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
+	lockRankMspanSpecial:  {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
+	lockRankProf:          {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
+	lockRankGcBitsArenas:  {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
 	lockRankRoot:          {},
 	lockRankTrace:         {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankHchan, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot},
-	lockRankTraceStackTab: {lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankTrace},
+	lockRankTraceStackTab: {lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankRoot, lockRankTrace},
 	lockRankNetpollInit:   {lockRankTimers},

 	lockRankRwmutexW: {},
 	lockRankRwmutexR: {lockRankSysmon, lockRankRwmutexW},

-	lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
-	lockRankGscan:        {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine},
-	lockRankStackpool:    {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan},
-	lockRankStackLarge:   {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan},
-	lockRankDefer:        {},
-	lockRankSudog:        {lockRankHchan, lockRankNotifyList},
-	lockRankWbufSpans:    {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog},
-	lockRankMheap:        {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans},
-	lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
-	lockRankGlobalAlloc:  {lockRankProf, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial},
+	lockRankSpanSetSpine:  {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
+	lockRankGscan:         {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine},
+	lockRankStackpool:     {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan},
+	lockRankStackLarge:    {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan},
+	lockRankDefer:         {},
+	lockRankSudog:         {lockRankHchan, lockRankNotifyList},
+	lockRankWbufSpans:     {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog},
+	lockRankMheap:         {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans},
+	lockRankMheapSpecial:  {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
+	lockRankGlobalAlloc:   {lockRankProf, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial},
+	lockRankPageAllocScav: {lockRankMheap},

 	lockRankGFree:     {lockRankSched},
 	lockRankHchanLeaf: {lockRankGscan, lockRankHchanLeaf},
--- a/src/runtime/mem_linux.go
+++ b/src/runtime/mem_linux.go
@ -114,9 +114,29 @@ func sysUnused(v unsafe.Pointer, n uintptr) {
 		atomic.Store(&adviseUnused, _MADV_DONTNEED)
 		madvise(v, n, _MADV_DONTNEED)
 	}
+
+	if debug.harddecommit > 0 {
+		p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0)
+		if p != v || err != 0 {
+			throw("runtime: cannot disable permissions in address space")
+		}
+	}
 }

 func sysUsed(v unsafe.Pointer, n uintptr) {
+	if debug.harddecommit > 0 {
+		p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0)
+		if err == _ENOMEM {
+			throw("runtime: out of memory")
+		}
+		if p != v || err != 0 {
+			throw("runtime: cannot remap pages in address space")
+		}
+		return
+
+		// Don't do the sysHugePage optimization in hard decommit mode.
+		// We're breaking up pages everywhere, there's no point.
+	}
 	// Partially undo the NOHUGEPAGE marks from sysUnused
 	// for whole huge pages between v and v+n. This may
 	// leave huge pages off at the end points v and v+n
--- a/src/runtime/mgcpacer.go
+++ b/src/runtime/mgcpacer.go
@ -349,9 +349,6 @@ func (c *gcControllerState) init(gcPercent int32) {
 			kp: 0.9,
 			ti: 4.0,

-			// An update is done once per GC cycle.
-			period: 1,
-
 			// Set a high reset time in GC cycles.
 			// This is inversely proportional to the rate at which we
 			// accumulate error from clipping. By making this very high
@ -677,8 +674,9 @@ func (c *gcControllerState) endCycle(now int64, procs int, userForced bool) floa
 			(float64(scanWork) * (1 - utilization))

 		// Update cons/mark controller.
+		// Period for this is 1 GC cycle.
 		oldConsMark := c.consMark
-		c.consMark = c.consMarkController.next(c.consMark, currentConsMark)
+		c.consMark = c.consMarkController.next(c.consMark, currentConsMark, 1.0)

 		if debug.gcpacertrace > 0 {
 			printlock()
@ -1259,10 +1257,7 @@ func readGOGC() int32 {
 type piController struct {
 	kp float64 // Proportional constant.
 	ti float64 // Integral time constant.
-	tt float64 // Reset time in GC cyles.
-
-	// Period in GC cycles between updates.
-	period float64
+	tt float64 // Reset time.

 	min, max float64 // Output boundaries.

@ -1271,7 +1266,7 @@ type piController struct {
 	errIntegral float64 // Integral of the error from t=0 to now.
 }

-func (c *piController) next(input, setpoint float64) float64 {
+func (c *piController) next(input, setpoint, period float64) float64 {
 	// Compute the raw output value.
 	prop := c.kp * (setpoint - input)
 	rawOutput := prop + c.errIntegral
@ -1286,7 +1281,7 @@ func (c *piController) next(input, setpoint float64) float64 {

 	// Update the controller's state.
 	if c.ti != 0 && c.tt != 0 {
-		c.errIntegral += (c.kp*c.period/c.ti)*(setpoint-input) + (c.period/c.tt)*(output-rawOutput)
+		c.errIntegral += (c.kp*period/c.ti)*(setpoint-input) + (period/c.tt)*(output-rawOutput)
 	}
 	return output
 }
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@ -270,49 +270,80 @@ func bgscavenge(c chan int) {
 	c <- 1
 	goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)

-	// Exponentially-weighted moving average of the fraction of time this
-	// goroutine spends scavenging (that is, percent of a single CPU).
-	// It represents a measure of scheduling overheads which might extend
-	// the sleep or the critical time beyond what's expected. Assume no
-	// overhead to begin with.
-	//
-	// TODO(mknyszek): Consider making this based on total CPU time of the
-	// application (i.e. scavengePercent * GOMAXPROCS). This isn't really
-	// feasible now because the scavenger acquires the heap lock over the
-	// scavenging operation, which means scavenging effectively blocks
-	// allocators and isn't scalable. However, given a scalable allocator,
-	// it makes sense to also make the scavenger scale with it; if you're
-	// allocating more frequently, then presumably you're also generating
-	// more work for the scavenger.
-	const idealFraction = scavengePercent / 100.0
-	scavengeEWMA := float64(idealFraction)
+	// idealFraction is the ideal % of overall application CPU time that we
+	// spend scavenging.
+	idealFraction := float64(scavengePercent) / 100.0

+	// Input: fraction of CPU time used.
+	// Setpoint: idealFraction.
+	// Output: ratio of critical time to sleep time (determines sleep time).
+	//
+	// The output of this controller is somewhat indirect to what we actually
+	// want to achieve: how much time to sleep for. The reason for this definition
+	// is to ensure that the controller's outputs have a direct relationship with
+	// its inputs (as opposed to an inverse relationship), making it somewhat
+	// easier to reason about for tuning purposes.
+	critSleepController := piController{
+		// Tuned loosely via Ziegler-Nichols process.
+		kp: 0.3375,
+		ti: 3.2e6,
+		tt: 1e9, // 1 second reset time.
+
+		// These ranges seem wide, but we want to give the controller plenty of
+		// room to hunt for the optimal value.
+		min: 0.001,  // 1:1000
+		max: 1000.0, // 1000:1
+	}
+	// It doesn't really matter what value we start at, but we can't be zero, because
+	// that'll cause divide-by-zero issues.
+	critSleepRatio := 0.001
 	for {
 		released := uintptr(0)
-
-		// Time in scavenging critical section.
 		crit := float64(0)

-		// Run on the system stack since we grab the heap lock,
-		// and a stack growth with the heap lock means a deadlock.
-		systemstack(func() {
-			lock(&mheap_.lock)
-
+		// Spend at least 1 ms scavenging, otherwise the corresponding
+		// sleep time to maintain our desired utilization is too low to
+		// be reliable.
+		const minCritTime = 1e6
+		for crit < minCritTime {
 			// If background scavenging is disabled or if there's no work to do just park.
 			retained, goal := heapRetained(), atomic.Load64(&mheap_.scavengeGoal)
 			if retained <= goal {
-				unlock(&mheap_.lock)
-				return
+				break
 			}

-			// Scavenge one page, and measure the amount of time spent scavenging.
-			start := nanotime()
-			released = mheap_.pages.scavenge(physPageSize, true)
-			mheap_.pages.scav.released += released
-			crit = float64(nanotime() - start)
+			// scavengeQuantum is the amount of memory we try to scavenge
+			// in one go. A smaller value means the scavenger is more responsive
+			// to the scheduler in case of e.g. preemption. A larger value means
+			// that the overheads of scavenging are better amortized, so better
+			// scavenging throughput.
+			//
+			// The current value is chosen assuming a cost of ~10µs/physical page
+			// (this is somewhat pessimistic), which implies a worst-case latency of
+			// about 160µs for 4 KiB physical pages. The current value is biased
+			// toward latency over throughput.
+			const scavengeQuantum = 64 << 10

-			unlock(&mheap_.lock)
-		})
+			// Accumulate the amount of time spent scavenging.
+			start := nanotime()
+			released = mheap_.pages.scavenge(scavengeQuantum)
+			atomic.Xadduintptr(&mheap_.pages.scav.released, released)
+			end := nanotime()
+
+			// On some platforms we may see end <= start if the time it takes to scavenge
+			// memory is less than the minimum granularity of its clock (e.g. Windows) or
+			// due to clock bugs.
+			//
+			// In this case, just assume scavenging takes 10 µs per regular physical page
+			// (determined empirically), and conservatively ignore the impact of huge pages
+			// on timing.
+			const approxCritNSPerPhysicalPage = 10e3
+			if end <= start {
+				crit += approxCritNSPerPhysicalPage * float64(released/physPageSize)
+			} else {
+				crit += float64(end - start)
+			}
+		}

 		if released == 0 {
 			lock(&scavenge.lock)
@ -329,18 +360,13 @@ func bgscavenge(c chan int) {
 			throw("released less than one physical page of memory")
 		}

-		// On some platforms we may see crit as zero if the time it takes to scavenge
-		// memory is less than the minimum granularity of its clock (e.g. Windows).
-		// In this case, just assume scavenging takes 10 µs per regular physical page
-		// (determined empirically), and conservatively ignore the impact of huge pages
-		// on timing.
-		//
-		// We shouldn't ever see a crit value less than zero unless there's a bug of
-		// some kind, either on our side or in the platform we're running on, but be
-		// defensive in that case as well.
-		const approxCritNSPerPhysicalPage = 10e3
-		if crit <= 0 {
-			crit = approxCritNSPerPhysicalPage * float64(released/physPageSize)
+		if crit < minCritTime {
+			// This means there wasn't enough work to actually fill up minCritTime.
+			// That's fine; we shouldn't try to do anything with this information
+			// because it's going to result in a short enough sleep request that things
+			// will get messy. Just assume we did at least this much work.
+			// All this means is that we'll sleep longer than we otherwise would have.
+			crit = minCritTime
 		}

 		// Multiply the critical time by 1 + the ratio of the costs of using
@ -351,41 +377,19 @@ func bgscavenge(c chan int) {
 		// because of the additional overheads of using scavenged memory.
 		crit *= 1 + scavengeCostRatio

-		// If we spent more than 10 ms (for example, if the OS scheduled us away, or someone
-		// put their machine to sleep) in the critical section, bound the time we use to
-		// calculate at 10 ms to avoid letting the sleep time get arbitrarily high.
-		const maxCrit = 10e6
-		if crit > maxCrit {
-			crit = maxCrit
-		}
+		// Go to sleep based on how much time we spent doing work.
+		slept := scavengeSleep(int64(crit / critSleepRatio))

-		// Compute the amount of time to sleep, assuming we want to use at most
-		// scavengePercent of CPU time. Take into account scheduling overheads
-		// that may extend the length of our sleep by multiplying by how far
-		// off we are from the ideal ratio. For example, if we're sleeping too
-		// much, then scavengeEMWA < idealFraction, so we'll adjust the sleep time
-		// down.
-		adjust := scavengeEWMA / idealFraction
-		sleepTime := int64(adjust * crit / (scavengePercent / 100.0))
+		// Calculate the CPU time spent.
+		//
+		// This may be slightly inaccurate with respect to GOMAXPROCS, but we're
+		// recomputing this often enough relative to GOMAXPROCS changes in general
+		// (it only changes when the world is stopped, and not during a GC) that
+		// that small inaccuracy is in the noise.
+		cpuFraction := float64(crit) / ((float64(slept) + crit) * float64(gomaxprocs))

-		// Go to sleep.
-		slept := scavengeSleep(sleepTime)
-
-		// Compute the new ratio.
-		fraction := crit / (crit + float64(slept))
-
-		// Set a lower bound on the fraction.
-		// Due to OS-related anomalies we may "sleep" for an inordinate amount
-		// of time. Let's avoid letting the ratio get out of hand by bounding
-		// the sleep time we use in our EWMA.
-		const minFraction = 1.0 / 1000.0
-		if fraction < minFraction {
-			fraction = minFraction
-		}
-
-		// Update scavengeEWMA by merging in the new crit/slept ratio.
-		const alpha = 0.5
-		scavengeEWMA = alpha*fraction + (1-alpha)*scavengeEWMA
+		// Update the critSleepRatio, adjusting until we reach our ideal fraction.
+		critSleepRatio = critSleepController.next(cpuFraction, idealFraction, float64(slept)+crit)
 	}
 }

@ -395,16 +399,7 @@ func bgscavenge(c chan int) {
 // back to the top of the heap.
 //
 // Returns the amount of memory scavenged in bytes.
-//
-// p.mheapLock must be held, but may be temporarily released if
-// mayUnlock == true.
-//
-// Must run on the system stack because p.mheapLock must be held.
-//
-//go:systemstack
-func (p *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr {
-	assertLockHeld(p.mheapLock)
-
+func (p *pageAlloc) scavenge(nbytes uintptr) uintptr {
 	var (
 		addrs addrRange
 		gen   uint32
@ -416,9 +411,11 @@ func (p *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr {
 				break
 			}
 		}
-		r, a := p.scavengeOne(addrs, nbytes-released, mayUnlock)
-		released += r
-		addrs = a
+		systemstack(func() {
+			r, a := p.scavengeOne(addrs, nbytes-released)
+			released += r
+			addrs = a
+		})
 	}
 	// Only unreserve the space which hasn't been scavenged or searched
 	// to ensure we always make progress.
@ -456,8 +453,9 @@ func printScavTrace(gen uint32, released uintptr, forced bool) {
 func (p *pageAlloc) scavengeStartGen() {
 	assertLockHeld(p.mheapLock)

+	lock(&p.scav.lock)
 	if debug.scavtrace > 0 {
-		printScavTrace(p.scav.gen, p.scav.released, false)
+		printScavTrace(p.scav.gen, atomic.Loaduintptr(&p.scav.released), false)
 	}
 	p.inUse.cloneInto(&p.scav.inUse)

@ -487,9 +485,10 @@ func (p *pageAlloc) scavengeStartGen() {
 	// arena in size, so virtually every heap has the scavenger on.
 	p.scav.reservationBytes = alignUp(p.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards
 	p.scav.gen++
-	p.scav.released = 0
+	atomic.Storeuintptr(&p.scav.released, 0)
 	p.scav.freeHWM = minOffAddr
 	p.scav.scavLWM = maxOffAddr
+	unlock(&p.scav.lock)
 }

 // scavengeReserve reserves a contiguous range of the address space
@ -498,14 +497,9 @@ func (p *pageAlloc) scavengeStartGen() {
 // first.
 //
 // Returns the reserved range and the scavenge generation number for it.
-//
-// p.mheapLock must be held.
-//
-// Must run on the system stack because p.mheapLock must be held.
-//
-//go:systemstack
 func (p *pageAlloc) scavengeReserve() (addrRange, uint32) {
-	assertLockHeld(p.mheapLock)
+	lock(&p.scav.lock)
+	gen := p.scav.gen

 	// Start by reserving the minimum.
 	r := p.scav.inUse.removeLast(p.scav.reservationBytes)
@ -513,7 +507,8 @@ func (p *pageAlloc) scavengeReserve() (addrRange, uint32) {
 	// Return early if the size is zero; we don't want to use
 	// the bogus address below.
 	if r.size() == 0 {
-		return r, p.scav.gen
+		unlock(&p.scav.lock)
+		return r, gen
 	}

 	// The scavenger requires that base be aligned to a
@ -524,28 +519,26 @@ func (p *pageAlloc) scavengeReserve() (addrRange, uint32) {

 	// Remove from inUse however much extra we just pulled out.
 	p.scav.inUse.removeGreaterEqual(newBase)
+	unlock(&p.scav.lock)
+
 	r.base = offAddr{newBase}
-	return r, p.scav.gen
+	return r, gen
 }

 // scavengeUnreserve returns an unscavenged portion of a range that was
 // previously reserved with scavengeReserve.
-//
-// p.mheapLock must be held.
-//
-// Must run on the system stack because p.mheapLock must be held.
-//
-//go:systemstack
 func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
-	assertLockHeld(p.mheapLock)
-
-	if r.size() == 0 || gen != p.scav.gen {
+	if r.size() == 0 {
 		return
 	}
 	if r.base.addr()%pallocChunkBytes != 0 {
 		throw("unreserving unaligned region")
 	}
-	p.scav.inUse.add(r)
+	lock(&p.scav.lock)
+	if gen == p.scav.gen {
+		p.scav.inUse.add(r)
+	}
+	unlock(&p.scav.lock)
 }

 // scavengeOne walks over address range work until it finds
@ -559,15 +552,10 @@ func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
 //
 // work's base address must be aligned to pallocChunkBytes.
 //
-// p.mheapLock must be held, but may be temporarily released if
-// mayUnlock == true.
-//
-// Must run on the system stack because p.mheapLock must be held.
+// Must run on the systemstack because it acquires p.mheapLock.
 //
 //go:systemstack
-func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) {
-	assertLockHeld(p.mheapLock)
-
+func (p *pageAlloc) scavengeOne(work addrRange, max uintptr) (uintptr, addrRange) {
 	// Defensively check if we've received an empty address range.
 	// If so, just return.
 	if work.size() == 0 {
@ -599,40 +587,12 @@ func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
 		minPages = 1
 	}

-	// Helpers for locking and unlocking only if mayUnlock == true.
-	lockHeap := func() {
-		if mayUnlock {
-			lock(p.mheapLock)
-		}
+	// Fast path: check the chunk containing the top-most address in work.
+	if r, w := p.scavengeOneFast(work, minPages, maxPages); r != 0 {
+		return r, w
+	} else {
+		work = w
 	}
-	unlockHeap := func() {
-		if mayUnlock {
-			unlock(p.mheapLock)
-		}
-	}
-
-	// Fast path: check the chunk containing the top-most address in work,
-	// starting at that address's page index in the chunk.
-	//
-	// Note that work.end() is exclusive, so get the chunk we care about
-	// by subtracting 1.
-	maxAddr := work.limit.addr() - 1
-	maxChunk := chunkIndex(maxAddr)
-	if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) {
-		// We only bother looking for a candidate if there at least
-		// minPages free pages at all.
-		base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages)
-
-		// If we found something, scavenge it and return!
-		if npages != 0 {
-			work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)}
-
-			assertLockHeld(p.mheapLock) // Must be locked on return.
-			return uintptr(npages) * pageSize, work
-		}
-	}
-	// Update the limit to reflect the fact that we checked maxChunk already.
-	work.limit = offAddr{chunkBase(maxChunk)}

 	// findCandidate finds the next scavenge candidate in work optimistically.
 	//
@ -671,37 +631,61 @@ func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
 	// looking for any free and unscavenged page. If we think we see something,
 	// lock and verify it!
 	for work.size() != 0 {
-		unlockHeap()

 		// Search for the candidate.
 		candidateChunkIdx, ok := findCandidate(work)
-
-		// Lock the heap. We need to do this now if we found a candidate or not.
-		// If we did, we'll verify it. If not, we need to lock before returning
-		// anyway.
-		lockHeap()
-
 		if !ok {
 			// We didn't find a candidate, so we're done.
 			work.limit = work.base
 			break
 		}

+		// Lock, so we can verify what we found.
+		lock(p.mheapLock)
+
 		// Find, verify, and scavenge if we can.
 		chunk := p.chunkOf(candidateChunkIdx)
 		base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages)
 		if npages > 0 {
 			work.limit = offAddr{p.scavengeRangeLocked(candidateChunkIdx, base, npages)}
-
-			assertLockHeld(p.mheapLock) // Must be locked on return.
+			unlock(p.mheapLock)
 			return uintptr(npages) * pageSize, work
 		}
+		unlock(p.mheapLock)

 		// We were fooled, so let's continue from where we left off.
 		work.limit = offAddr{chunkBase(candidateChunkIdx)}
 	}
+	return 0, work
+}

-	assertLockHeld(p.mheapLock) // Must be locked on return.
+// scavengeOneFast is the fast path for scavengeOne, which just checks the top
+// chunk of work for some pages to scavenge.
+//
+// Must run on the system stack because it acquires the heap lock.
+//
+//go:systemstack
+func (p *pageAlloc) scavengeOneFast(work addrRange, minPages, maxPages uintptr) (uintptr, addrRange) {
+	maxAddr := work.limit.addr() - 1
+	maxChunk := chunkIndex(maxAddr)
+
+	lock(p.mheapLock)
+	if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) {
+		// We only bother looking for a candidate if there are at least
+		// minPages free pages at all.
+		base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages)
+
+		// If we found something, scavenge it and return!
+		if npages != 0 {
+			work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)}
+			unlock(p.mheapLock)
+			return uintptr(npages) * pageSize, work
+		}
+	}
+	unlock(p.mheapLock)
+
+	// Update the limit to reflect the fact that we checked maxChunk already.
+	work.limit = offAddr{chunkBase(maxChunk)}
 	return 0, work
 }

@ -712,38 +696,57 @@ func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
 //
 // Returns the base address of the scavenged region.
 //
-// p.mheapLock must be held.
+// p.mheapLock must be held. Unlocks p.mheapLock but reacquires
+// it before returning. Must be run on the systemstack as a result.
+//
+//go:systemstack
 func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr {
 	assertLockHeld(p.mheapLock)

-	p.chunkOf(ci).scavenged.setRange(base, npages)
-
 	// Compute the full address for the start of the range.
 	addr := chunkBase(ci) + uintptr(base)*pageSize

+	// Mark the range we're about to scavenge as allocated, because
+	// we don't want any allocating goroutines to grab it while
+	// the scavenging is in progress.
+	if scav := p.allocRange(addr, uintptr(npages)); scav != 0 {
+		throw("double scavenge")
+	}
+
+	// With that done, it's safe to unlock.
+	unlock(p.mheapLock)
+
 	// Update the scavenge low watermark.
+	lock(&p.scav.lock)
 	if oAddr := (offAddr{addr}); oAddr.lessThan(p.scav.scavLWM) {
 		p.scav.scavLWM = oAddr
 	}
+	unlock(&p.scav.lock)

-	// Only perform the actual scavenging if we're not in a test.
-	// It's dangerous to do so otherwise.
-	if p.test {
-		return addr
+	if !p.test {
+		// Only perform the actual scavenging if we're not in a test.
+		// It's dangerous to do so otherwise.
+		sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)
+
+		// Update global accounting only when not in test, otherwise
+		// the runtime's accounting will be wrong.
+		nbytes := int64(npages) * pageSize
+		atomic.Xadd64(&memstats.heap_released, nbytes)
+
+		// Update consistent accounting too.
+		stats := memstats.heapStats.acquire()
+		atomic.Xaddint64(&stats.committed, -nbytes)
+		atomic.Xaddint64(&stats.released, nbytes)
+		memstats.heapStats.release()
 	}
-	sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)

-	// Update global accounting only when not in test, otherwise
-	// the runtime's accounting will be wrong.
-	nbytes := int64(npages) * pageSize
-	atomic.Xadd64(&memstats.heap_released, nbytes)
-
-	// Update consistent accounting too.
-	stats := memstats.heapStats.acquire()
-	atomic.Xaddint64(&stats.committed, -nbytes)
-	atomic.Xaddint64(&stats.released, nbytes)
-	memstats.heapStats.release()
+	// Relock the heap, because now we need to make these pages
+	// available for allocation. Free them back to the page allocator.
+	lock(p.mheapLock)
+	p.free(addr, uintptr(npages), true)

+	// Mark the range as scavenged.
+	p.chunkOf(ci).scavenged.setRange(base, npages)
 	return addr
 }

--- a/src/runtime/mgcscavenge_test.go
+++ b/src/runtime/mgcscavenge_test.go
@ -430,12 +430,12 @@ func TestPageAllocScavenge(t *testing.T) {
 	}
 	for name, v := range tests {
 		v := v
-		runTest := func(t *testing.T, mayUnlock bool) {
+		t.Run(name, func(t *testing.T) {
 			b := NewPageAlloc(v.beforeAlloc, v.beforeScav)
 			defer FreePageAlloc(b)

 			for iter, h := range v.expect {
-				if got := b.Scavenge(h.request, mayUnlock); got != h.expect {
+				if got := b.Scavenge(h.request); got != h.expect {
 					t.Fatalf("bad scavenge #%d: want %d, got %d", iter+1, h.expect, got)
 				}
 			}
@ -443,12 +443,6 @@ func TestPageAllocScavenge(t *testing.T) {
 			defer FreePageAlloc(want)

 			checkPageAlloc(t, want, b)
-		}
-		t.Run(name, func(t *testing.T) {
-			runTest(t, false)
-		})
-		t.Run(name+"MayUnlock", func(t *testing.T) {
-			runTest(t, true)
 		})
 	}
 }
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@ -80,7 +80,7 @@ type mheap struct {
 	// access (since that may free the backing store).
 	allspans []*mspan // all spans out there

-	_ uint32 // align uint64 fields on 32-bit for atomics
+	// _ uint32 // align uint64 fields on 32-bit for atomics

 	// Proportional sweep
 	//
@ -1120,6 +1120,7 @@ func (h *mheap) allocSpan(npages uintptr, typ spanAllocType, spanclass spanClass
 	// Function-global state.
 	gp := getg()
 	base, scav := uintptr(0), uintptr(0)
+	growth := uintptr(0)

 	// On some platforms we need to provide physical page aligned stack
 	// allocations. Where the page size is less than the physical page
@ -1165,7 +1166,9 @@ func (h *mheap) allocSpan(npages uintptr, typ spanAllocType, spanclass spanClass
 		// Try to acquire a base address.
 		base, scav = h.pages.alloc(npages)
 		if base == 0 {
-			if !h.grow(npages) {
+			var ok bool
+			growth, ok = h.grow(npages)
+			if !ok {
 				unlock(&h.lock)
 				return nil
 			}
@ -1189,16 +1192,35 @@ func (h *mheap) allocSpan(npages uintptr, typ spanAllocType, spanclass spanClass
 		// Return memory around the aligned allocation.
 		spaceBefore := base - allocBase
 		if spaceBefore > 0 {
-			h.pages.free(allocBase, spaceBefore/pageSize)
+			h.pages.free(allocBase, spaceBefore/pageSize, false)
 		}
 		spaceAfter := (allocPages-npages)*pageSize - spaceBefore
 		if spaceAfter > 0 {
-			h.pages.free(base+npages*pageSize, spaceAfter/pageSize)
+			h.pages.free(base+npages*pageSize, spaceAfter/pageSize, false)
 		}
 	}

 	unlock(&h.lock)

+	if growth > 0 {
+		// We just caused a heap growth, so scavenge down what will soon be used.
+		// By scavenging inline we deal with the failure to allocate out of
+		// memory fragments by scavenging the memory fragments that are least
+		// likely to be re-used.
+		scavengeGoal := atomic.Load64(&h.scavengeGoal)
+		if retained := heapRetained(); retained+uint64(growth) > scavengeGoal {
+			// The scavenging algorithm requires the heap lock to be dropped so it
+			// can acquire it only sparingly. This is a potentially expensive operation
+			// so it frees up other goroutines to allocate in the meanwhile. In fact,
+			// they can make use of the growth we just created.
+			todo := growth
+			if overage := uintptr(retained + uint64(growth) - scavengeGoal); todo > overage {
+				todo = overage
+			}
+			h.pages.scavenge(todo)
+		}
+	}
+
 HaveSpan:
 	// At this point, both s != nil and base != 0, and the heap
 	// lock is no longer held. Initialize the span.
@ -1311,10 +1333,10 @@ HaveSpan:
 }

 // Try to add at least npage pages of memory to the heap,
-// returning whether it worked.
+// returning how much the heap grew by and whether it worked.
 //
 // h.lock must be held.
-func (h *mheap) grow(npage uintptr) bool {
+func (h *mheap) grow(npage uintptr) (uintptr, bool) {
 	assertLockHeld(&h.lock)

 	// We must grow the heap in whole palloc chunks.
@ -1336,7 +1358,7 @@ func (h *mheap) grow(npage uintptr) bool {
 		av, asize := h.sysAlloc(ask)
 		if av == nil {
 			print("runtime: out of memory: cannot allocate ", ask, "-byte block (", memstats.heap_sys, " in use)\n")
-			return false
+			return 0, false
 		}

 		if uintptr(av) == h.curArena.end {
@ -1396,20 +1418,7 @@ func (h *mheap) grow(npage uintptr) bool {
 	// space ready for allocation.
 	h.pages.grow(v, nBase-v)
 	totalGrowth += nBase - v
-
-	// We just caused a heap growth, so scavenge down what will soon be used.
-	// By scavenging inline we deal with the failure to allocate out of
-	// memory fragments by scavenging the memory fragments that are least
-	// likely to be re-used.
-	scavengeGoal := atomic.Load64(&h.scavengeGoal)
-	if retained := heapRetained(); retained+uint64(totalGrowth) > scavengeGoal {
-		todo := totalGrowth
-		if overage := uintptr(retained + uint64(totalGrowth) - scavengeGoal); todo > overage {
-			todo = overage
-		}
-		h.pages.scavenge(todo, false)
-	}
-	return true
+	return totalGrowth, true
 }

 // Free the span back into the heap.
@ -1499,7 +1508,7 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) {
 	memstats.heapStats.release()

 	// Mark the space as free.
-	h.pages.free(s.base(), s.npages)
+	h.pages.free(s.base(), s.npages, false)

 	// Free the span structure. We no longer have a use for it.
 	s.state.set(mSpanDead)
@ -1515,13 +1524,19 @@ func (h *mheap) scavengeAll() {
 	// the mheap API.
 	gp := getg()
 	gp.m.mallocing++
+
 	lock(&h.lock)
 	// Start a new scavenge generation so we have a chance to walk
 	// over the whole heap.
 	h.pages.scavengeStartGen()
-	released := h.pages.scavenge(^uintptr(0), false)
-	gen := h.pages.scav.gen
 	unlock(&h.lock)
+
+	released := h.pages.scavenge(^uintptr(0))
+
+	lock(&h.pages.scav.lock)
+	gen := h.pages.scav.gen
+	unlock(&h.pages.scav.lock)
+
 	gp.m.mallocing--

 	if debug.scavtrace > 0 {
--- a/src/runtime/mpagealloc.go
+++ b/src/runtime/mpagealloc.go
@ -226,6 +226,8 @@ type pageAlloc struct {
 	// are currently available. Otherwise one might iterate over unused
 	// ranges.
 	//
+	// Protected by mheapLock.
+	//
 	// TODO(mknyszek): Consider changing the definition of the bitmap
 	// such that 1 means free and 0 means in-use so that summaries and
 	// the bitmaps align better on zero-values.
@ -261,29 +263,41 @@ type pageAlloc struct {
 	inUse addrRanges

 	// scav stores the scavenger state.
-	//
-	// All fields are protected by mheapLock.
 	scav struct {
+		lock mutex
+
 		// inUse is a slice of ranges of address space which have not
 		// yet been looked at by the scavenger.
+		//
+		// Protected by lock.
 		inUse addrRanges

 		// gen is the scavenge generation number.
+		//
+		// Protected by lock.
 		gen uint32

 		// reservationBytes is how large of a reservation should be made
 		// in bytes of address space for each scavenge iteration.
+		//
+		// Protected by lock.
 		reservationBytes uintptr

 		// released is the amount of memory released this generation.
+		//
+		// Updated atomically.
 		released uintptr

 		// scavLWM is the lowest (offset) address that the scavenger reached this
 		// scavenge generation.
+		//
+		// Protected by lock.
 		scavLWM offAddr

 		// freeHWM is the highest (offset) address of a page that was freed to
 		// the page allocator this scavenge generation.
+		//
+		// Protected by mheapLock.
 		freeHWM offAddr
 	}

@ -864,17 +878,19 @@ Found:
 // Must run on the system stack because p.mheapLock must be held.
 //
 //go:systemstack
-func (p *pageAlloc) free(base, npages uintptr) {
+func (p *pageAlloc) free(base, npages uintptr, scavenged bool) {
 	assertLockHeld(p.mheapLock)

 	// If we're freeing pages below the p.searchAddr, update searchAddr.
 	if b := (offAddr{base}); b.lessThan(p.searchAddr) {
 		p.searchAddr = b
 	}
-	// Update the free high watermark for the scavenger.
 	limit := base + npages*pageSize - 1
-	if offLimit := (offAddr{limit}); p.scav.freeHWM.lessThan(offLimit) {
-		p.scav.freeHWM = offLimit
+	if !scavenged {
+		// Update the free high watermark for the scavenger.
+		if offLimit := (offAddr{limit}); p.scav.freeHWM.lessThan(offLimit) {
+			p.scav.freeHWM = offLimit
+		}
 	}
 	if npages == 1 {
 		// Fast path: we're clearing a single bit, and we know exactly
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@ -790,7 +790,15 @@ type consistentHeapStats struct {
 //
 // The caller's P must not change between acquire and
 // release. This also means that the caller should not
-// acquire a P or release its P in between.
+// acquire a P or release its P in between. A P also must
+// not acquire a given consistentHeapStats if it hasn't
+// yet released it.
+//
+// nosplit because a stack growth in this function could
+// lead to a stack allocation that could reenter the
+// function.
+//
+//go:nosplit
 func (m *consistentHeapStats) acquire() *heapStatsDelta {
 	if pp := getg().m.p.ptr(); pp != nil {
 		seq := atomic.Xadd(&pp.statsSeq, 1)
@ -814,6 +822,12 @@ func (m *consistentHeapStats) acquire() *heapStatsDelta {
 // The caller's P must not change between acquire and
 // release. This also means that the caller should not
 // acquire a P or release its P in between.
+//
+// nosplit because a stack growth in this function could
+// lead to a stack allocation that causes another acquire
+// before this operation has completed.
+//
+//go:nosplit
 func (m *consistentHeapStats) release() {
 	if pp := getg().m.p.ptr(); pp != nil {
 		seq := atomic.Xadd(&pp.statsSeq, 1)
--- a/src/runtime/runtime1.go
+++ b/src/runtime/runtime1.go
@ -315,6 +315,7 @@ var debug struct {
 	schedtrace         int32
 	tracebackancestors int32
 	asyncpreemptoff    int32
+	harddecommit       int32

 	// debug.malloc is used as a combined debug check
 	// in the malloc function and should be set
@ -344,6 +345,7 @@ var dbgvars = []dbgVar{
 	{"tracebackancestors", &debug.tracebackancestors},
 	{"asyncpreemptoff", &debug.asyncpreemptoff},
 	{"inittrace", &debug.inittrace},
+	{"harddecommit", &debug.harddecommit},
 }

 func parsedebugvars() {
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@ -1002,7 +1002,7 @@ func newstack() {
 	// NOTE: stackguard0 may change underfoot, if another thread
 	// is about to try to preempt gp. Read it just once and use that same
 	// value now and below.
-	preempt := atomic.Loaduintptr(&gp.stackguard0) == stackPreempt
+	stackguard0 := atomic.Loaduintptr(&gp.stackguard0)

 	// Be conservative about where we preempt.
 	// We are interested in preempting user Go code, not runtime code.
@ -1016,6 +1016,7 @@ func newstack() {
 	// If the GC is in some way dependent on this goroutine (for example,
 	// it needs a lock held by the goroutine), that small preemption turns
 	// into a real deadlock.
+	preempt := stackguard0 == stackPreempt
 	if preempt {
 		if !canPreemptM(thisg.m) {
 			// Let the goroutine keep running for now.
@ -1083,7 +1084,7 @@ func newstack() {
 		}
 	}

-	if gp.stackguard0 == stackForceMove {
+	if stackguard0 == stackForceMove {
 		// Forced stack movement used for debugging.
 		// Don't double the stack (or we may quickly run out
 		// if this is done repeatedly).
--- a/src/runtime/traceback_test.go
+++ b/src/runtime/traceback_test.go
@ -353,6 +353,9 @@ func testTracebackArgs8d(a testArgsType8d) int {
 	return n
 }

+// nosplit to avoid preemption or morestack spilling registers.
+//
+//go:nosplit
 //go:noinline
 func testTracebackArgs9(a int64, b int32, c int16, d int8, x [2]int, y int) int {
 	if a < 0 {
@ -366,6 +369,9 @@ func testTracebackArgs9(a int64, b int32, c int16, d int8, x [2]int, y int) int
 	return n
 }

+// nosplit to avoid preemption or morestack spilling registers.
+//
+//go:nosplit
 //go:noinline
 func testTracebackArgs10(a, b, c, d, e int32) int {
 	// no use of any args
@ -373,8 +379,10 @@ func testTracebackArgs10(a, b, c, d, e int32) int {
 }

 // norace to avoid race instrumentation changing spill locations.
+// nosplit to avoid preemption or morestack spilling registers.
 //
 //go:norace
+//go:nosplit
 //go:noinline
 func testTracebackArgs11a(a, b, c int32) int {
 	if a < 0 {
@ -387,8 +395,10 @@ func testTracebackArgs11a(a, b, c int32) int {
 }

 // norace to avoid race instrumentation changing spill locations.
+// nosplit to avoid preemption or morestack spilling registers.
 //
 //go:norace
+//go:nosplit
 //go:noinline
 func testTracebackArgs11b(a, b, c, d int32) int {
 	var x int32
--- a/src/strings/strings.go
+++ b/src/strings/strings.go
@ -706,7 +706,8 @@ func isSeparator(r rune) bool {
 // Title returns a copy of the string s with all Unicode letters that begin words
 // mapped to their Unicode title case.
 //
-// BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
+// Deprecated: The rule Title uses for word boundaries does not handle Unicode
+// punctuation properly. Use golang.org/x/text/cases instead.
 func Title(s string) string {
 	// Use a closure here to remember state.
 	// Hackish but effective. Depends on Map scanning in order and calling
--- a/src/syscall/mkall.sh
+++ b/src/syscall/mkall.sh
@ -283,7 +283,7 @@ netbsd_arm64)
 	mktypes="GOARCH=$GOARCH go tool cgo -godefs"
 	;;
 openbsd_386)
-	GOOSARCH_in="syscall_openbsd1.go syscall_openbsd_$GOARCH.go"
+	GOOSARCH_in="syscall_openbsd_libc.go syscall_openbsd_$GOARCH.go"
 	mkerrors="$mkerrors -m32"
 	mksyscall="./mksyscall.pl -l32 -openbsd -libc"
 	mksysctl="./mksysctl_openbsd.pl"
--- a/src/time/example_test.go
+++ b/src/time/example_test.go
@ -344,6 +344,23 @@ func ExampleTime_Format_pad() {

 }

+func ExampleTime_GoString() {
+	t := time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC)
+	fmt.Println(t.GoString())
+	t = t.Add(1 * time.Minute)
+	fmt.Println(t.GoString())
+	t = t.AddDate(0, 1, 0)
+	fmt.Println(t.GoString())
+	t, _ = time.Parse("Jan 2, 2006 at 3:04pm (MST)", "Feb 3, 2013 at 7:54pm (UTC)")
+	fmt.Println(t.GoString())
+
+	// Output:
+	// time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC)
+	// time.Date(2009, time.November, 10, 23, 1, 0, 0, time.UTC)
+	// time.Date(2009, time.December, 10, 23, 1, 0, 0, time.UTC)
+	// time.Date(2013, time.February, 3, 19, 54, 0, 0, time.UTC)
+}
+
 func ExampleParse() {
 	// See the example for Time.Format for a thorough description of how
 	// to define the layout string to parse a time.Time value; Parse and
@ -401,6 +418,39 @@ func ExampleParseInLocation() {
 	// 2012-07-09 00:00:00 +0200 CEST
 }

+func ExampleUnix() {
+	unixTime := time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC)
+	fmt.Println(unixTime.Unix())
+	t := time.Unix(unixTime.Unix(), 0).UTC()
+	fmt.Println(t)
+
+	// Output:
+	// 1257894000
+	// 2009-11-10 23:00:00 +0000 UTC
+}
+
+func ExampleUnixMicro() {
+	umt := time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC)
+	fmt.Println(umt.UnixMicro())
+	t := time.UnixMicro(umt.UnixMicro()).UTC()
+	fmt.Println(t)
+
+	// Output:
+	// 1257894000000000
+	// 2009-11-10 23:00:00 +0000 UTC
+}
+
+func ExampleUnixMilli() {
+	umt := time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC)
+	fmt.Println(umt.UnixMilli())
+	t := time.UnixMilli(umt.UnixMilli()).UTC()
+	fmt.Println(t)
+
+	// Output:
+	// 1257894000000
+	// 2009-11-10 23:00:00 +0000 UTC
+}
+
 func ExampleTime_Unix() {
 	// 1 billion seconds of Unix, three ways.
 	fmt.Println(time.Unix(1e9, 0).UTC())     // 1e9 seconds
--- a/src/time/tick.go
+++ b/src/time/tick.go
@ -48,8 +48,12 @@ func (t *Ticker) Stop() {
 }

 // Reset stops a ticker and resets its period to the specified duration.
-// The next tick will arrive after the new period elapses.
+// The next tick will arrive after the new period elapses. The duration d
+// must be greater than zero; if not, Reset will panic.
 func (t *Ticker) Reset(d Duration) {
+	if d <= 0 {
+		panic("non-positive interval for Ticker.Reset")
+	}
 	if t.r.f == nil {
 		panic("time: Reset called on uninitialized Ticker")
 	}
--- a/src/time/tick_test.go
+++ b/src/time/tick_test.go
@ -134,6 +134,17 @@ func TestNewTickerLtZeroDuration(t *testing.T) {
 	NewTicker(-1)
 }

+// Test that Ticker.Reset panics when given a non-positive duration.
+func TestTickerResetLtZeroDuration(t *testing.T) {
+	defer func() {
+		if err := recover(); err == nil {
+			t.Errorf("Ticker.Reset(0) should have panicked")
+		}
+	}()
+	tk := NewTicker(Second)
+	tk.Reset(0)
+}
+
 func BenchmarkTicker(b *testing.B) {
 	benchmark(b, func(n int) {
 		ticker := NewTicker(Nanosecond)
--- a/src/time/time.go
+++ b/src/time/time.go
@ -1433,17 +1433,17 @@ func Date(year int, month Month, day, hour, min, sec, nsec int, loc *Location) T

 	unix := int64(abs) + (absoluteToInternal + internalToUnix)

-	// Look for zone offset for t, so we can adjust to UTC.
-	// The lookup function expects UTC, so we pass t in the
+	// Look for zone offset for expected time, so we can adjust to UTC.
+	// The lookup function expects UTC, so first we pass unix in the
 	// hope that it will not be too close to a zone transition,
 	// and then adjust if it is.
 	_, offset, start, end, _ := loc.lookup(unix)
 	if offset != 0 {
-		switch utc := unix - int64(offset); {
-		case utc < start:
-			_, offset, _, _, _ = loc.lookup(start - 1)
-		case utc >= end:
-			_, offset, _, _, _ = loc.lookup(end)
+		utc := unix - int64(offset)
+		// If utc is valid for the time zone we found, then we have the right offset.
+		// If not, we get the correct offset by looking up utc in the location.
+		if utc < start || utc >= end {
+			_, offset, _, _, _ = loc.lookup(utc)
 		}
 		unix -= int64(offset)
 	}
--- a/src/time/time_test.go
+++ b/src/time/time_test.go
@ -1616,3 +1616,45 @@ func TestTimeAddSecOverflow(t *testing.T) {
 		}
 	}
 }
+
+// Issue 49284: time: ParseInLocation parses incorrectly because of Daylight Saving Time
+func TestTimeWithZoneTransition(t *testing.T) {
+	ForceZipFileForTesting(true)
+	defer ForceZipFileForTesting(false)
+
+	loc, err := LoadLocation("Asia/Shanghai")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	tests := [...]struct {
+		give Time
+		want Time
+	}{
+		// 14 Apr 1991 - Daylight Saving Time Started
+		// When local standard time of "Asia/Shanghai" was about to reach
+		// Sunday, 14 April 1991, 02:00:00 clocks were turned forward 1 hour to
+		// Sunday, 14 April 1991, 03:00:00 local daylight time instead.
+		// The UTC time was 13 April 1991, 18:00:00
+		0: {Date(1991, April, 13, 17, 50, 0, 0, loc), Date(1991, April, 13, 9, 50, 0, 0, UTC)},
+		1: {Date(1991, April, 13, 18, 0, 0, 0, loc), Date(1991, April, 13, 10, 0, 0, 0, UTC)},
+		2: {Date(1991, April, 14, 1, 50, 0, 0, loc), Date(1991, April, 13, 17, 50, 0, 0, UTC)},
+		3: {Date(1991, April, 14, 3, 0, 0, 0, loc), Date(1991, April, 13, 18, 0, 0, 0, UTC)},
+
+		// 15 Sep 1991 - Daylight Saving Time Ended
+		// When local daylight time of "Asia/Shanghai" was about to reach
+		// Sunday, 15 September 1991, 02:00:00 clocks were turned backward 1 hour to
+		// Sunday, 15 September 1991, 01:00:00 local standard time instead.
+		// The UTC time was 14 September 1991, 17:00:00
+		4: {Date(1991, September, 14, 16, 50, 0, 0, loc), Date(1991, September, 14, 7, 50, 0, 0, UTC)},
+		5: {Date(1991, September, 14, 17, 0, 0, 0, loc), Date(1991, September, 14, 8, 0, 0, 0, UTC)},
+		6: {Date(1991, September, 15, 0, 50, 0, 0, loc), Date(1991, September, 14, 15, 50, 0, 0, UTC)},
+		7: {Date(1991, September, 15, 2, 00, 0, 0, loc), Date(1991, September, 14, 18, 00, 0, 0, UTC)},
+	}
+
+	for i, tt := range tests {
+		if !tt.give.Equal(tt.want) {
+			t.Errorf("#%d:: %#v is not equal to %#v", i, tt.give.Format(RFC3339), tt.want.Format(RFC3339))
+		}
+	}
+}
--- a/src/unicode/utf8/example_test.go
+++ b/src/unicode/utf8/example_test.go
@ -214,3 +214,13 @@ func ExampleValidString() {
 	// true
 	// false
 }
+
+func ExampleAppendRune() {
+	buf1 := utf8.AppendRune(nil, 0x10000)
+	buf2 := utf8.AppendRune([]byte("init"), 0x10000)
+	fmt.Println(string(buf1))
+	fmt.Println(string(buf2))
+	// Output:
+	// 𐀀
+	// init𐀀
+}
--- a/src/unicode/utf8/utf8_test.go
+++ b/src/unicode/utf8/utf8_test.go
@ -133,7 +133,7 @@ func TestAppendRune(t *testing.T) {
 			t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, m.str)
 		}
 		if buf := AppendRune([]byte("init"), m.r); string(buf) != "init"+m.str {
-			t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, "init"+m.str)
+			t.Errorf("AppendRune(init, %#04x) = %s, want %s", m.r, buf, "init"+m.str)
 		}
 	}
 }
--- a/test/fixedbugs/issue49378.go
+++ b/test/fixedbugs/issue49378.go
@ -0,0 +1,25 @@
+// compile
+
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package p
+
+func f(i int) {
+	var s1 struct {
+		s struct{ s struct{ i int } }
+	}
+	var s2, s3 struct {
+		a struct{ i int }
+		b int
+	}
+	func() {
+		i = 1 + 2*i + s3.a.i + func() int {
+			s2.a, s2.b = s3.a, s3.b
+			return 0
+		}() + func(*int) int {
+			return s1.s.s.i
+		}(new(int))
+	}()
+}