diff --git a/src/go/build/deps_test.go b/src/go/build/deps_test.go index 28a39ea145..21bf8b76a5 100644 --- a/src/go/build/deps_test.go +++ b/src/go/build/deps_test.go @@ -785,8 +785,7 @@ var depsRules = ` # Test-only packages can have anything they want FMT, compress/gzip, embed, encoding/binary < encoding/json/internal/jsontest; CGO, internal/syscall/unix < net/internal/cgotest; - - + FMT < math/big/internal/asmgen; ` // listStdPkgs returns the same list of packages as "go list std". diff --git a/src/math/big/internal/asmgen/add.go b/src/math/big/internal/asmgen/add.go new file mode 100644 index 0000000000..ee15e3a96f --- /dev/null +++ b/src/math/big/internal/asmgen/add.go @@ -0,0 +1,57 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package asmgen + +// addOrSubVV generates addVV or subVV, +// which do z, c = x ± y. +// The caller guarantees that len(z) == len(x) == len(y). +func addOrSubVV(a *Asm, name string) { + f := a.Func("func " + name + "(z, x, y []Word) (c Word)") + + add := a.Add + which := AddCarry + if name == "subVV" { + add = a.Sub + which = SubCarry + } + + n := f.Arg("z_len") + p := f.Pipe() + p.SetHint("y", HintMemOK) // allow y to be used from memory on x86 + p.Start(n, 1, 4) + var c Reg + if !a.Arch.CarrySafeLoop { + // Carry smashed by loop tests; allocate and save in register + // around unrolled blocks. + c = a.Reg() + a.Mov(a.Imm(0), c) + a.EOL("clear saved carry") + p.AtUnrollStart(func() { a.RestoreCarry(c); a.Free(c) }) + p.AtUnrollEnd(func() { a.Unfree(c); a.SaveCarry(c) }) + } else { + // Carry preserved by loop; clear now, ahead of loop + // (but after Start, which may have modified it). + a.ClearCarry(which) + } + p.Loop(func(in, out [][]Reg) { + for i, x := range in[0] { + y := in[1][i] + add(y, x, x, SetCarry|UseCarry) + } + p.StoreN(in[:1]) + }) + p.Done() + + // Copy carry to output. + if c.Valid() { + a.ConvertCarry(which, c) + } else { + c = a.RegHint(HintCarry) + a.SaveConvertCarry(which, c) + } + f.StoreArg(c, "c") + a.Free(c) + a.Ret() +} diff --git a/src/math/big/internal/asmgen/arch.go b/src/math/big/internal/asmgen/arch.go new file mode 100644 index 0000000000..bcba3992a9 --- /dev/null +++ b/src/math/big/internal/asmgen/arch.go @@ -0,0 +1,238 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package asmgen + +import ( + "fmt" + "strings" +) + +// Note: Exported fields and methods are expected to be used +// by function generators (like the ones in add.go and so on). +// Unexported fields and methods should not be. + +// An Arch defines how to generate assembly for a specific architecture. +type Arch struct { + Name string // name of architecture + Build string // build tag + WordBits int // length of word in bits (32 or 64) + WordBytes int // length of word in bytes (4 or 8) + CarrySafeLoop bool // whether loops preserve carry flag across iterations + + // Registers. + regs []string // usable general registers, in allocation order + reg0 string // dedicated zero register + regCarry string // dedicated carry register + regAltCarry string // dedicated secondary carry register + regTmp string // dedicated temporary register + + // setup is called to emit any per-architecture function prologue, + // immediately after the TEXT line has been emitted. + // If setup is nil, it is taken to be a no-op. 
+	setup func(*Func)
+
+	// hint returns the register to use for a given hint.
+	// Returning an empty string indicates no preference.
+	// If hint is nil, it is considered to return an empty string.
+	hint func(*Asm, Hint) string
+
+	// op3 reports whether the named opcode accepts 3 operands
+	// (true of most instructions on most systems, but not true of x86 instructions).
+	// The assembler unconditionally turns op x,z,z into op x,z.
+	// If op3 returns false, then the assembler will turn op x,y,z into mov y,z; op x,z.
+	// If op3 is nil, then all opcodes are assumed to accept 3 operands.
+	op3 func(name string) bool
+
+	// memOK indicates that arithmetic instructions can use memory references (like on x86)
+	memOK bool
+
+	// maxColumns is the default maximum number of vector columns
+	// to process in a single [Pipe.Loop] block.
+	// 0 means unlimited.
+	// [Pipe.SetMaxColumns] overrides this.
+	maxColumns int
+
+	// Instruction names.
+	mov   string // move (word-sized)
+	add   string // add with no carry involvement
+	adds  string // add, setting but not using carry
+	adc   string // add, using but not setting carry
+	adcs  string // add, setting and using carry
+	sub   string // sub with no carry involvement
+	subs  string // sub, setting but not using carry
+	sbc   string // sub, using but not setting carry
+	sbcs  string // sub, setting and using carry
+	mul   string // multiply
+	mulhi string // multiply producing high bits
+	lsh   string // left shift
+	lshd  string // double-width left shift
+	rsh   string // right shift
+	rshd  string // double-width right shift
+	and   string // bitwise and
+	or    string // bitwise or
+	xor   string // bitwise xor
+	neg   string // negate
+	rsb   string // reverse subtract
+	sltu  string // set less-than unsigned (dst = src2 < src1), for carry-less systems
+	sgtu  string // set greater-than unsigned (dst = src2 > src1), for carry-less systems
+	lea   string // load effective address
+
+	// addF and subF implement a.Add and a.Sub
+	// on systems where the situation is more complicated than
+	// the six basic instructions (add, adds, adcs, sub, subs, sbcs).
+	// They return a boolean indicating whether the operation was handled.
+	addF func(a *Asm, src1, src2, dst Reg, carry Carry) bool
+	subF func(a *Asm, src1, src2, dst Reg, carry Carry) bool
+
+	// lshF and rshF implement a.Lsh and a.Rsh
+	// on systems where the situation is more complicated than
+	// a simple instruction opcode.
+	// They must succeed.
+	lshF func(a *Asm, shift, src, dst Reg)
+	rshF func(a *Asm, shift, src, dst Reg)
+
+	// mulWideF implements MulWide.
+	// It calls Fatalf if the operation is unsupported.
+	// An architecture can set the mul and mulhi instruction names instead;
+	// mulWideF is optional when those are set.
+	mulWideF func(a *Asm, src1, src2, dstlo, dsthi Reg)
+
+	// addWords is a printf format taking src1, src2, dst
+	// that sets dst = WordBytes*src1+src2.
+	// It may modify the carry flag.
+	addWords string
+
+	// subCarryIsBorrow is true when the actual processor carry bit used in subtraction
+	// is really a “borrow” bit, meaning 1 means borrow and 0 means no borrow.
+	// In contrast, most systems (except x86) use a carry bit with the opposite
+	// meaning: 0 means a borrow happened, and 1 means it didn't.
+	subCarryIsBorrow bool
+
+	// Jump instruction printf formats.
+	// jmpZero and jmpNonZero are printf formats taking src, label
+	// that jump to label if src is zero / non-zero.
+	jmpZero    string
+	jmpNonZero string
+
+	// loopTop is a printf format taking src, label that should
+	// jump to label if src is zero, or else set up for a loop.
+	// If loopTop is not set, jmpZero is used.
+	loopTop string
+
+	// loopBottom is a printf format taking dst, label that should
+	// decrement dst and then jump to label if dst is non-zero.
+	// If loopBottom is not set, a subtraction is used followed by
+	// use of jmpNonZero.
+	loopBottom string
+
+	// loopBottomNeg is like loopBottom but used in negative-index
+	// loops, which only happen when memIndex is also set (only on 386).
+	// It increments dst instead of decrementing it.
+	loopBottomNeg string
+
+	// Indexed memory access.
+	// If set, memIndex returns a memory reference for a mov instruction
+	// addressing off(ptr)(ix*WordBytes).
+	// Using memIndex costs an extra register but allows the end-of-loop
+	// to do a single increment/decrement instead of advancing two or three pointers.
+	// This is particularly important on 386.
+	memIndex func(a *Asm, off int, ix Reg, ptr RegPtr) Reg
+
+	// Incrementing/decrementing memory access.
+	// loadIncN loads memory at ptr into regs, incrementing ptr by WordBytes after each reg.
+	// loadDecN loads memory at ptr into regs, decrementing ptr by WordBytes before each reg.
+	// storeIncN and storeDecN are the same, but storing from regs instead of loading into regs.
+	// If missing, the assembler accesses memory and advances pointers using separate instructions.
+	loadIncN  func(a *Asm, ptr RegPtr, regs []Reg)
+	loadDecN  func(a *Asm, ptr RegPtr, regs []Reg)
+	storeIncN func(a *Asm, ptr RegPtr, regs []Reg)
+	storeDecN func(a *Asm, ptr RegPtr, regs []Reg)
+
+	// options is a map from optional CPU features to functions that test for them.
+	// The test function should jump to label if the feature is available.
+	options map[Option]func(a *Asm, label string)
+}
+
+// HasShiftWide reports whether the Arch has working LshWide/RshWide instructions.
+// If not, calling them will panic.
+func (a *Arch) HasShiftWide() bool {
+	return a.lshd != ""
+}
+
+// A Hint is a hint about what a register will be used for,
+// so that an appropriate one can be selected.
+type Hint uint
+
+const (
+	HintNone       Hint = iota
+	HintShiftCount // shift count (CX on x86)
+	HintMulSrc     // mul source operand (AX on x86)
+	HintMulHi      // wide mul high output (DX on x86)
+	HintMemOK      // a memory reference is okay
+	HintCarry      // carry flag
+	HintAltCarry   // secondary carry flag
+)
+
+// A Reg is an allocated register or other assembly operand.
+// (For example, a constant might have name "$123"
+// and a memory reference might have name "0(R8)".)
+type Reg struct{ name string }
+
+// IsImm reports whether r is an immediate value.
+func (r Reg) IsImm() bool { return strings.HasPrefix(r.name, "$") }
+
+// IsMem reports whether r is a memory value.
+func (r Reg) IsMem() bool { return strings.HasSuffix(r.name, ")") }
+
+// String returns the assembly syntax for r.
+func (r Reg) String() string { return r.name }
+
+// Valid reports whether r is valid, meaning r is not the zero value of Reg (a register with no name).
+func (r Reg) Valid() bool { return r.name != "" }
+
+// A RegPtr is like a Reg but expected to hold a pointer.
+// The separate Go type helps keep pointers and scalars separate and avoid mistakes;
+// it is okay to convert to Reg as needed to use specific routines.
+type RegPtr struct{ name string }
+
+// String returns the assembly syntax for r.
+func (r RegPtr) String() string { return r.name }
+
+// Valid reports whether r is valid, meaning r is not the zero value of RegPtr (a register with no name).
+func (r RegPtr) Valid() bool { return r.name != "" }
+
+// mem returns a memory reference to off bytes from the pointer r.
+func (r *RegPtr) mem(off int) Reg { return Reg{fmt.Sprintf("%d(%s)", off, r)} }
+
+// A Carry is a flag field explaining how an instruction sets and uses the carry flags.
+// Different operations expect different sets of bits.
+// Add and Sub expect: UseCarry or 0, SetCarry, KeepCarry, or SmashCarry; and AltCarry or 0.
+// ClearCarry, SaveCarry, and ConvertCarry expect: AddCarry or SubCarry; and AltCarry or 0.
+type Carry uint
+
+const (
+	SetCarry   Carry = 1 << iota // sets carry
+	UseCarry                     // uses carry
+	KeepCarry                    // must preserve carry
+	SmashCarry                   // can modify carry or not, whatever is easiest
+
+	AltCarry // use the secondary carry flag
+	AddCarry // use add carry flag semantics (for ClearCarry, ConvertCarry)
+	SubCarry // use sub carry flag semantics (for ClearCarry, ConvertCarry)
+)
+
+// An Option denotes an optional CPU feature that can be tested at runtime.
+type Option int
+
+const (
+	_ Option = iota
+
+	// OptionAltCarry checks whether there is an add instruction
+	// that uses a secondary carry flag, so that two different sums
+	// can be accumulated in parallel with independent carry flags.
+	// Some architectures (MIPS, Loong64, RISC-V) provide this
+	// functionality natively, indicated by asm.AltCarry().Valid() being true.
+	OptionAltCarry
+)
diff --git a/src/math/big/internal/asmgen/arm.go b/src/math/big/internal/asmgen/arm.go
new file mode 100644
index 0000000000..eeec320838
--- /dev/null
+++ b/src/math/big/internal/asmgen/arm.go
@@ -0,0 +1,87 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package asmgen
+
+import "strings"
+
+var ArchARM = &Arch{
+	Name:          "arm",
+	WordBits:      32,
+	WordBytes:     4,
+	CarrySafeLoop: true,
+
+	regs: []string{
+		// R10 is g.
+		// R11 is the assembler/linker temporary (but we use it as a regular register).
+		// R13 is SP.
+		// R14 is LR.
+		// R15 is PC.
+ "R0", "R1", "R2", "R3", "R4", "R5", "R6", "R7", "R8", "R9", "R11", "R12", + }, + + mov: "MOVW", + add: "ADD", + adds: "ADD.S", + adc: "ADC", + adcs: "ADC.S", + sub: "SUB", + subs: "SUB.S", + sbc: "SBC", + sbcs: "SBC.S", + rsb: "RSB", + and: "AND", + or: "ORR", + xor: "EOR", + lshF: armLsh, + rshF: armRsh, + + mulWideF: armMulWide, + + addWords: "ADD %s<<2, %s, %s", + + jmpZero: "TEQ $0, %s; BEQ %s", + jmpNonZero: "TEQ $0, %s; BNE %s", + + loadIncN: armLoadIncN, + loadDecN: armLoadDecN, + storeIncN: armStoreIncN, + storeDecN: armStoreDecN, +} + +func armLsh(a *Asm, shift, src, dst Reg) { + a.Printf("\tMOVW %s<<%s, %s\n", src, strings.TrimPrefix(shift.String(), "$"), dst) +} + +func armRsh(a *Asm, shift, src, dst Reg) { + a.Printf("\tMOVW %s>>%s, %s\n", src, strings.TrimPrefix(shift.String(), "$"), dst) +} + +func armMulWide(a *Asm, src1, src2, dstlo, dsthi Reg) { + a.Printf("\tMULLU %s, %s, (%s, %s)\n", src1, src2, dsthi, dstlo) +} + +func armLoadIncN(a *Asm, p RegPtr, regs []Reg) { + for _, r := range regs { + a.Printf("\tMOVW.P %d(%s), %s\n", a.Arch.WordBytes, p, r) + } +} + +func armLoadDecN(a *Asm, p RegPtr, regs []Reg) { + for _, r := range regs { + a.Printf("\tMOVW.W %d(%s), %s\n", -a.Arch.WordBytes, p, r) + } +} + +func armStoreIncN(a *Asm, p RegPtr, regs []Reg) { + for _, r := range regs { + a.Printf("\tMOVW.P %s, %d(%s)\n", r, a.Arch.WordBytes, p) + } +} + +func armStoreDecN(a *Asm, p RegPtr, regs []Reg) { + for _, r := range regs { + a.Printf("\tMOVW.W %s, %d(%s)\n", r, -a.Arch.WordBytes, p) + } +} diff --git a/src/math/big/internal/asmgen/asm.go b/src/math/big/internal/asmgen/asm.go new file mode 100644 index 0000000000..cc2cfc32d1 --- /dev/null +++ b/src/math/big/internal/asmgen/asm.go @@ -0,0 +1,781 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package asmgen + +import ( + "bytes" + "cmp" + "fmt" + "math/bits" + "slices" + "strings" +) + +// Note: Exported fields and methods are expected to be used +// by function generators (like the ones in add.go and so on). +// Unexported fields and methods should not be. + +// An Asm is an assembly file being written. +type Asm struct { + Arch *Arch // architecture + out bytes.Buffer // output buffer + regavail uint64 // bitmap of available registers + enabled map[Option]bool // enabled optional CPU features +} + +// NewAsm returns a new Asm preparing assembly +// for the given architecture to be written to file. +func NewAsm(arch *Arch) *Asm { + a := &Asm{Arch: arch, enabled: make(map[Option]bool)} + buildTag := "" + if arch.Build != "" { + buildTag = " && (" + arch.Build + ")" + } + a.Printf(asmHeader, buildTag) + return a +} + +// Note: Using Copyright 2025, not the current year, to avoid test failures +// on January 1 and spurious diffs when regenerating assembly. +// The generator was written in 2025; that's good enough. +// (As a matter of policy the Go project does not update copyright +// notices every year, since copyright terms are so long anyway.) + +var asmHeader = `// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT. + +//go:build !math_big_pure_go%s + +#include "textflag.h" +` + +// Fatalf reports a fatal error by panicking. 
+// Panicking is appropriate because there is a bug in the generator,
+// and panicking will show the exact source lines leading to that bug.
+func (a *Asm) Fatalf(format string, args ...any) {
+	text := a.out.String()
+	i := strings.LastIndex(text, "\nTEXT")
+	text = text[i+1:]
+	panic("[" + a.Arch.Name + "] asmgen internal error: " + fmt.Sprintf(format, args...) + "\n" + text)
+}
+
+// hint returns the register name for the given hint.
+func (a *Asm) hint(h Hint) string {
+	if h == HintCarry && a.Arch.regCarry != "" {
+		return a.Arch.regCarry
+	}
+	if h == HintAltCarry && a.Arch.regAltCarry != "" {
+		return a.Arch.regAltCarry
+	}
+	if h == HintNone || a.Arch.hint == nil {
+		return ""
+	}
+	return a.Arch.hint(a, h)
+}
+
+// ZR returns the zero register (the specific register guaranteed to hold the integer 0),
+// or else the zero Reg (Reg{}, which has r.Valid() == false).
+func (a *Asm) ZR() Reg {
+	return Reg{a.Arch.reg0}
+}
+
+// tmp returns the temporary register, or else the zero Reg.
+// The temporary register is one available for use implementing logical instructions
+// that compile into multiple actual instructions on a given system.
+// The assembler sometimes uses it for that purpose, as do we.
+// Of course, if we are using it, we'd better not emit an instruction that
+// will cause the assembler to smash it while we want it to be holding
+// a live value. In general it is the architecture implementation's responsibility
+// not to suggest the use of any such pseudo-instructions in situations
+// where they would cause problems.
+func (a *Asm) tmp() Reg {
+	return Reg{a.Arch.regTmp}
+}
+
+// Carry returns the carry register, or else the zero Reg.
+func (a *Asm) Carry() Reg {
+	return Reg{a.Arch.regCarry}
+}
+
+// AltCarry returns the secondary carry register, or else the zero Reg.
+func (a *Asm) AltCarry() Reg {
+	return Reg{a.Arch.regAltCarry}
+}
+
+// Imm returns a Reg representing an immediate (constant) value.
+func (a *Asm) Imm(x int) Reg {
+	if x == 0 && a.Arch.reg0 != "" {
+		return Reg{a.Arch.reg0}
+	}
+	return Reg{fmt.Sprintf("$%d", x)}
+}
+
+// IsZero reports whether r is a zero immediate or the zero register.
+func (a *Asm) IsZero(r Reg) bool {
+	return r.name == "$0" || a.Arch.reg0 != "" && r.name == a.Arch.reg0
+}
+
+// Reg allocates a new register.
+func (a *Asm) Reg() Reg {
+	i := bits.TrailingZeros64(a.regavail)
+	if i == 64 {
+		a.Fatalf("out of registers")
+	}
+	a.regavail ^= 1 << i
+	return Reg{a.Arch.regs[i]}
+}
+
+// RegHint allocates a new register, with a hint as to its purpose.
+func (a *Asm) RegHint(hint Hint) Reg {
+	if name := a.hint(hint); name != "" {
+		i := slices.Index(a.Arch.regs, name)
+		if i < 0 {
+			return Reg{name}
+		}
+		if a.regavail&(1<<i) == 0 {
+			a.Fatalf("register %s not available", name)
+		}
+		a.regavail ^= 1 << i
+		return Reg{name}
+	}
+	return a.Reg()
+}
+
+// Free frees a previously allocated register.
+// If r is not a register (if it's an immediate or a memory reference), Free is a no-op.
+func (a *Asm) Free(r Reg) {
+	i := slices.Index(a.Arch.regs, r.name)
+	if i < 0 {
+		return
+	}
+	if a.regavail&(1<<i) != 0 {
+		a.Fatalf("register %s already freed", r.name)
+	}
+	a.regavail |= 1 << i
+}
+
+// Unfree reallocates a previously freed register r.
+// If r is not a register (if it's an immediate or a memory reference), Unfree is a no-op.
+// If r is not free, Unfree panics.
+func (a *Asm) Unfree(r Reg) {
+	i := slices.Index(a.Arch.regs, r.name)
+	if i < 0 {
+		return
+	}
+	if a.regavail&(1<<i) == 0 {
+		a.Fatalf("register %s not free", r.name)
+	}
+	a.regavail &^= 1 << i
+}
+
+// A RegsUsed is a snapshot of which registers are in use.
+type RegsUsed struct {
+	avail uint64
+}
+
+// RegsUsed returns a snapshot of which registers are currently in use.
+func (a *Asm) RegsUsed() RegsUsed {
+	return RegsUsed{a.regavail}
+}
+
+// SetRegsUsed sets which registers are currently in use.
+func (a *Asm) SetRegsUsed(used RegsUsed) {
+	a.regavail = used.avail
+}
+
+// FreeAll frees all the registers.
+func (a *Asm) FreeAll() {
+	a.regavail = 1<<len(a.Arch.regs) - 1
+}
+
+// Printf emits to the assembly output.
+func (a *Asm) Printf(format string, args ...any) {
+	text := fmt.Sprintf(format, args...)
+	if strings.Contains(text, "%!") {
+		a.Fatalf("printf error: %s", text)
+	}
+	a.out.WriteString(text)
+}
+
+// Comment emits a line comment to the assembly output.
+func (a *Asm) Comment(format string, args ...any) {
+	fmt.Fprintf(&a.out, "\t// %s\n", fmt.Sprintf(format, args...))
+}
+
+// EOL appends an end-of-line comment to the previous line.
+func (a *Asm) EOL(format string, args ...any) {
+	bytes := a.out.Bytes()
+	if len(bytes) > 0 && bytes[len(bytes)-1] == '\n' {
+		a.out.Truncate(a.out.Len() - 1)
+	}
+	a.Comment(format, args...)
+}
+
+// JmpEnable emits a test for the optional CPU feature that jumps to label if the feature is present.
+// If JmpEnable returns false, the feature is not available on this architecture and no code was emitted.
+func (a *Asm) JmpEnable(option Option, label string) bool {
+	jmpEnable := a.Arch.options[option]
+	if jmpEnable == nil {
+		return false
+	}
+	jmpEnable(a, label)
+	return true
+}
+
+// Enabled reports whether the optional CPU feature is considered
+// to be enabled at this point in the assembly output.
+func (a *Asm) Enabled(option Option) bool {
+	return a.enabled[option]
+}
+
+// SetOption changes whether the optional CPU feature should be
+// considered to be enabled.
+func (a *Asm) SetOption(option Option, on bool) { + a.enabled[option] = on +} + +// op3 emits a 3-operand instruction op src1, src2, dst, +// taking care to handle 2-operand machines and also +// to simplify the printout when src2==dst. +func (a *Asm) op3(op string, src1, src2, dst Reg) { + if op == "" { + a.Fatalf("missing instruction") + } + if src2 == dst { + // src2 and dst are same; print as 2-op form. + a.Printf("\t%s %s, %s\n", op, src1, dst) + } else if a.Arch.op3 != nil && !a.Arch.op3(op) { + // Machine does not have 3-op form for op; convert to 2-op. + if src1 == dst { + a.Fatalf("implicit mov %s, %s would smash src1", src2, dst) + } + a.Mov(src2, dst) + a.Printf("\t%s %s, %s\n", op, src1, dst) + } else { + // Full 3-op form. + a.Printf("\t%s %s, %s, %s\n", op, src1, src2, dst) + } +} + +// Mov emits dst = src. +func (a *Asm) Mov(src, dst Reg) { + if src != dst { + a.Printf("\t%s %s, %s\n", a.Arch.mov, src, dst) + } +} + +// AddWords emits dst = src1*WordBytes + src2. +// It does not set or use the carry flag. +func (a *Asm) AddWords(src1 Reg, src2, dst RegPtr) { + if a.Arch.addWords == "" { + // Note: Assuming that Lsh does not clobber the carry flag. + // Architectures where this is not true (x86) need to provide Arch.addWords. + t := a.Reg() + a.Lsh(a.Imm(bits.TrailingZeros(uint(a.Arch.WordBytes))), src1, t) + a.Add(t, Reg(src2), Reg(dst), KeepCarry) + a.Free(t) + return + } + a.Printf("\t"+a.Arch.addWords+"\n", src1, src2, dst) +} + +// And emits dst = src1 & src2 +// It may modify the carry flag. +func (a *Asm) And(src1, src2, dst Reg) { + a.op3(a.Arch.and, src1, src2, dst) +} + +// Or emits dst = src1 | src2 +// It may modify the carry flag. +func (a *Asm) Or(src1, src2, dst Reg) { + a.op3(a.Arch.or, src1, src2, dst) +} + +// Xor emits dst = src1 ^ src2 +// It may modify the carry flag. +func (a *Asm) Xor(src1, src2, dst Reg) { + a.op3(a.Arch.xor, src1, src2, dst) +} + +// Neg emits dst = -src. +// It may modify the carry flag. +func (a *Asm) Neg(src, dst Reg) { + if a.Arch.neg == "" { + if a.Arch.rsb != "" { + a.Printf("\t%s $0, %s, %s\n", a.Arch.rsb, src, dst) + return + } + if a.Arch.sub != "" && a.Arch.reg0 != "" { + a.Printf("\t%s %s, %s, %s\n", a.Arch.sub, src, a.Arch.reg0, dst) + return + } + a.Fatalf("missing neg") + } + if src == dst { + a.Printf("\t%s %s\n", a.Arch.neg, dst) + } else { + a.Printf("\t%s %s, %s\n", a.Arch.neg, src, dst) + } +} + +// Lsh emits dst = src << shift. +// It may modify the carry flag. +func (a *Asm) Lsh(shift, src, dst Reg) { + if need := a.hint(HintShiftCount); need != "" && shift.name != need && !shift.IsImm() { + a.Fatalf("shift count not in %s", need) + } + if a.Arch.lshF != nil { + a.Arch.lshF(a, shift, src, dst) + return + } + a.op3(a.Arch.lsh, shift, src, dst) +} + +// LshWide emits dst = src << shift with low bits shifted from adj. +// It may modify the carry flag. +func (a *Asm) LshWide(shift, adj, src, dst Reg) { + if a.Arch.lshd == "" { + a.Fatalf("no lshwide on %s", a.Arch.Name) + } + if need := a.hint(HintShiftCount); need != "" && shift.name != need && !shift.IsImm() { + a.Fatalf("shift count not in %s", need) + } + a.op3(fmt.Sprintf("%s %s,", a.Arch.lshd, shift), adj, src, dst) +} + +// Rsh emits dst = src >> shift. +// It may modify the carry flag. 
+func (a *Asm) Rsh(shift, src, dst Reg) {
+	if need := a.hint(HintShiftCount); need != "" && shift.name != need && !shift.IsImm() {
+		a.Fatalf("shift count not in %s", need)
+	}
+	if a.Arch.rshF != nil {
+		a.Arch.rshF(a, shift, src, dst)
+		return
+	}
+	a.op3(a.Arch.rsh, shift, src, dst)
+}
+
+// RshWide emits dst = src >> shift with high bits shifted from adj.
+// It may modify the carry flag.
+func (a *Asm) RshWide(shift, adj, src, dst Reg) {
+	if a.Arch.rshd == "" {
+		a.Fatalf("no rshwide on %s", a.Arch.Name)
+	}
+	if need := a.hint(HintShiftCount); need != "" && shift.name != need && !shift.IsImm() {
+		a.Fatalf("shift count not in %s", need)
+	}
+	a.op3(fmt.Sprintf("%s %s,", a.Arch.rshd, shift), adj, src, dst)
+}
+
+// SLTU emits dst = src2 < src1 (0 or 1), using an unsigned comparison.
+func (a *Asm) SLTU(src1, src2, dst Reg) {
+	switch {
+	default:
+		a.Fatalf("arch has no sltu/sgtu")
+	case a.Arch.sltu != "":
+		a.Printf("\t%s %s, %s, %s\n", a.Arch.sltu, src1, src2, dst)
+	case a.Arch.sgtu != "":
+		a.Printf("\t%s %s, %s, %s\n", a.Arch.sgtu, src2, src1, dst)
+	}
+}
+
+// Add emits dst = src1+src2, with the specified carry behavior.
+func (a *Asm) Add(src1, src2, dst Reg, carry Carry) {
+	switch {
+	default:
+		a.Fatalf("unsupported carry behavior")
+	case a.Arch.addF != nil && a.Arch.addF(a, src1, src2, dst, carry):
+		// handled
+	case a.Arch.add != "" && (carry == KeepCarry || carry == SmashCarry):
+		a.op3(a.Arch.add, src1, src2, dst)
+	case a.Arch.adds != "" && (carry == SetCarry || carry == SmashCarry):
+		a.op3(a.Arch.adds, src1, src2, dst)
+	case a.Arch.adc != "" && (carry == UseCarry || carry == UseCarry|SmashCarry):
+		a.op3(a.Arch.adc, src1, src2, dst)
+	case a.Arch.adcs != "" && (carry == UseCarry|SetCarry || carry == UseCarry|SmashCarry):
+		a.op3(a.Arch.adcs, src1, src2, dst)
+	case a.Arch.lea != "" && (carry == KeepCarry || carry == SmashCarry):
+		if src1.IsImm() {
+			a.Printf("\t%s %s(%s), %s\n", a.Arch.lea, src1.name[1:], src2, dst) // name[1:] removes $
+		} else {
+			a.Printf("\t%s (%s)(%s), %s\n", a.Arch.lea, src1, src2, dst)
+		}
+		if src2 == dst {
+			a.EOL("ADD %s, %s", src1, dst)
+		} else {
+			a.EOL("ADD %s, %s, %s", src1, src2, dst)
+		}
+
+	case a.Arch.add != "" && a.Arch.regCarry != "":
+		// Machine has no carry flag; instead we've dedicated a register
+		// and use SLTU/SGTU (set less-than/greater-than unsigned)
+		// to compute the carry flags as needed.
+		// For ADD x, y, z, SLTU x/y, z, c computes the carry bit.
+		// Either of x or y can be used as the second argument, provided
+		// it is not aliased to z.
+		// To make the output less of a wall of instructions,
+		// we comment the “higher-level” operation, with ... marking
+		// continued instructions implementing the operation.
+		cr := a.Carry()
+		if carry&AltCarry != 0 {
+			cr = a.AltCarry()
+			if !cr.Valid() {
+				a.Fatalf("alt carry not supported")
+			}
+			carry &^= AltCarry
+		}
+		tmp := a.tmp()
+		if !tmp.Valid() {
+			a.Fatalf("cannot simulate add carry without regTmp")
+		}
+		switch carry {
+		default:
+			a.Fatalf("unsupported carry behavior")
+		case UseCarry, UseCarry | SmashCarry:
+			// Easy case, just add the carry afterward.
+			if a.IsZero(src1) {
+				// Only here to use the carry.
+				a.Add(cr, src2, dst, KeepCarry)
+				a.EOL("ADC $0, %s, %s", src2, dst)
+				break
+			}
+			a.Add(src1, src2, dst, KeepCarry)
+			a.EOL("ADC %s, %s, %s (cr=%s)", src1, src2, dst, cr)
+			a.Add(cr, dst, dst, KeepCarry)
+			a.EOL("...")
+
+		case SetCarry:
+			if a.IsZero(src1) && src2 == dst {
+				// Only here to clear the carry flag. (Caller will comment.)
+				a.Xor(cr, cr, cr)
+				break
+			}
+			var old Reg // old is a src distinct from dst
+			switch {
+			case dst != src1:
+				old = src1
+			case dst != src2:
+				old = src2
+			default:
+				// src1 == src2 == dst.
+				// Overflows if and only if the high bit is set, so copy high bit to carry.
+				a.Rsh(a.Imm(a.Arch.WordBits-1), src1, cr)
+				a.EOL("ADDS %s, %s, %s (cr=%s)", src1, src2, dst, cr)
+				a.Add(src1, src2, dst, KeepCarry)
+				a.EOL("...")
+				return
+			}
+			a.Add(src1, src2, dst, KeepCarry)
+			a.EOL("ADDS %s, %s, %s (cr=%s)", src1, src2, dst, cr)
+			a.SLTU(old, dst, cr) // dst < old (one of the src) implies carry
+			a.EOL("...")
+
+		case UseCarry | SetCarry:
+			if a.IsZero(src1) {
+				// Only here to use and then set the carry.
+				// Easy since carry is not aliased to dst.
+				a.Add(cr, src2, dst, KeepCarry)
+				a.EOL("ADCS $0, %s, %s (cr=%s)", src2, dst, cr)
+				a.SLTU(cr, dst, cr) // dst < cr implies carry
+				a.EOL("...")
+				break
+			}
+			// General case. Need to do two different adds (src1 + src2 + cr),
+			// computing carry bits for both, and add'ing them together.
+			// Start with src1+src2.
+			var old Reg // old is a src distinct from dst
+			switch {
+			case dst != src1:
+				old = src1
+			case dst != src2:
+				old = src2
+			}
+			if old.Valid() {
+				a.Add(src1, src2, dst, KeepCarry)
+				a.EOL("ADCS %s, %s, %s (cr=%s)", src1, src2, dst, cr)
+				a.SLTU(old, dst, tmp) // dst < old (one of the src) implies carry
+				a.EOL("...")
+			} else {
+				// src1 == src2 == dst, like above. Sign bit is carry bit,
+				// but we copy it into tmp, not cr.
+				a.Rsh(a.Imm(a.Arch.WordBits-1), src1, tmp)
+				a.EOL("ADCS %s, %s, %s (cr=%s)", src1, src2, dst, cr)
+				a.Add(src1, src2, dst, KeepCarry)
+				a.EOL("...")
+			}
+			// Add cr to dst.
+			a.Add(cr, dst, dst, KeepCarry)
+			a.EOL("...")
+			a.SLTU(cr, dst, cr) // sum < cr implies carry
+			a.EOL("...")
+			// Add the two carry bits (at most one can be set, because (2⁶⁴-1)+(2⁶⁴-1)+1 < 2·2⁶⁴).
+			a.Add(tmp, cr, cr, KeepCarry)
+			a.EOL("...")
+		}
+	}
+}
+
+// Sub emits dst = src2-src1, with the specified carry behavior.
+func (a *Asm) Sub(src1, src2, dst Reg, carry Carry) {
+	switch {
+	default:
+		a.Fatalf("unsupported carry behavior")
+	case a.Arch.subF != nil && a.Arch.subF(a, src1, src2, dst, carry):
+		// handled
+	case a.Arch.sub != "" && (carry == KeepCarry || carry == SmashCarry):
+		a.op3(a.Arch.sub, src1, src2, dst)
+	case a.Arch.subs != "" && (carry == SetCarry || carry == SmashCarry):
+		a.op3(a.Arch.subs, src1, src2, dst)
+	case a.Arch.sbc != "" && (carry == UseCarry || carry == UseCarry|SmashCarry):
+		a.op3(a.Arch.sbc, src1, src2, dst)
+	case a.Arch.sbcs != "" && (carry == UseCarry|SetCarry || carry == UseCarry|SmashCarry):
+		a.op3(a.Arch.sbcs, src1, src2, dst)
+	case strings.HasPrefix(src1.name, "$") && (carry == KeepCarry || carry == SmashCarry):
+		// Running out of options; if this is an immediate
+		// and we don't need to worry about carry semantics,
+		// try adding the negation.
+		if strings.HasPrefix(src1.name, "$-") {
+			src1.name = "$" + src1.name[2:]
+		} else {
+			src1.name = "$-" + src1.name[1:]
+		}
+		a.Add(src1, src2, dst, carry)
+
+	case a.Arch.sub != "" && a.Arch.regCarry != "":
+		// Machine has no carry flag; instead we've dedicated a register
+		// and use SLTU/SGTU (set less-than/greater-than unsigned)
+		// to compute the carry bits as needed.
+		// For SUB x, y, z, SLTU x, y, c computes the carry (borrow) bit.
+		// To make the output less of a wall of instructions,
+		// we comment the “higher-level” operation, with ... marking
+		// continued instructions implementing the operation.
+		// Be careful!
Subtract and add have different overflow behaviors, + // so the details here are NOT the same as in Add above. + cr := a.Carry() + if carry&AltCarry != 0 { + a.Fatalf("alt carry not supported") + } + tmp := a.tmp() + if !tmp.Valid() { + a.Fatalf("cannot simulate carry without regTmp") + } + switch carry { + default: + a.Fatalf("unsupported carry behavior") + case UseCarry, UseCarry | SmashCarry: + // Easy case, just subtract the carry afterward. + if a.IsZero(src1) { + // Only here to use the carry. + a.Sub(cr, src2, dst, KeepCarry) + a.EOL("SBC $0, %s, %s", src2, dst) + break + } + a.Sub(src1, src2, dst, KeepCarry) + a.EOL("SBC %s, %s, %s", src1, src2, dst) + a.Sub(cr, dst, dst, KeepCarry) + a.EOL("...") + + case SetCarry: + if a.IsZero(src1) && src2 == dst { + // Only here to clear the carry flag. + a.Xor(cr, cr, cr) + break + } + // Compute the new carry first, in case dst is src1 or src2. + a.SLTU(src1, src2, cr) + a.EOL("SUBS %s, %s, %s", src1, src2, dst) + a.Sub(src1, src2, dst, KeepCarry) + a.EOL("...") + + case UseCarry | SetCarry: + if a.IsZero(src1) { + // Only here to use and then set the carry. + if src2 == dst { + // Unfortunate case. Using src2==dst is common (think x -= y) + // and also more efficient on two-operand machines (like x86), + // but here subtracting from dst will smash src2, making it + // impossible to recover the carry information after the SUB. + // But we want to use the carry, so we can't compute it before + // the SUB either. Compute into a temporary and MOV. + a.SLTU(cr, src2, tmp) + a.EOL("SBCS $0, %s, %s", src2, dst) + a.Sub(cr, src2, dst, KeepCarry) + a.EOL("...") + a.Mov(tmp, cr) + a.EOL("...") + break + } + a.Sub(cr, src2, dst, KeepCarry) // src2 not dst, so src2 preserved + a.SLTU(cr, src2, cr) + break + } + // General case. Need to do two different subtracts (src2 - cr - src1), + // computing carry bits for both, and add'ing them together. + // Doing src2 - cr first frees up cr to store the carry from the sub of src1. + a.SLTU(cr, src2, tmp) + a.EOL("SBCS %s, %s, %s", src1, src2, dst) + a.Sub(cr, src2, dst, KeepCarry) + a.EOL("...") + a.SLTU(src1, dst, cr) + a.EOL("...") + a.Sub(src1, dst, dst, KeepCarry) + a.EOL("...") + a.Add(tmp, cr, cr, KeepCarry) + a.EOL("...") + } + } +} + +// ClearCarry clears the carry flag. +// The ‘which’ parameter must be AddCarry or SubCarry to specify how the flag will be used. +// (On some systems, the sub carry's actual processor bit is inverted from its usual value.) +func (a *Asm) ClearCarry(which Carry) { + dst := Reg{a.Arch.regs[0]} // not actually modified + switch which & (AddCarry | SubCarry) { + default: + a.Fatalf("bad carry") + case AddCarry: + a.Add(a.Imm(0), dst, dst, SetCarry|which&AltCarry) + case SubCarry: + a.Sub(a.Imm(0), dst, dst, SetCarry|which&AltCarry) + } + a.EOL("clear carry") +} + +// SaveCarry saves the carry flag into dst. +// The meaning of the bits in dst is architecture-dependent. +// The carry flag is left in an undefined state. +func (a *Asm) SaveCarry(dst Reg) { + // Note: As implemented here, the carry flag is actually left unmodified, + // but we say it is in an undefined state in case that changes in the future. + // (The SmashCarry could be changed to SetCarry if so.) + if cr := a.Carry(); cr.Valid() { + if cr == dst { + return // avoid EOL + } + a.Mov(cr, dst) + } else { + a.Sub(dst, dst, dst, UseCarry|SmashCarry) + } + a.EOL("save carry") +} + +// RestoreCarry restores the carry flag from src. +// src is left in an undefined state. 
+func (a *Asm) RestoreCarry(src Reg) {
+	if cr := a.Carry(); cr.Valid() {
+		if cr == src {
+			return // avoid EOL
+		}
+		a.Mov(src, cr)
+	} else if a.Arch.subCarryIsBorrow {
+		a.Add(src, src, src, SetCarry)
+	} else {
+		// SaveCarry saved the sub carry flag with an encoding of 0, 1 -> 0, ^0.
+		// Restore it by subtracting from a value less than ^0, which will carry if src != 0.
+		// If there is no zero register, the SP register is guaranteed to be less than ^0.
+		// (This may seem too clever, but on GOARCH=arm we have no other good options.)
+		a.Sub(src, cmp.Or(a.ZR(), Reg{"SP"}), src, SetCarry)
+	}
+	a.EOL("restore carry")
+}
+
+// ConvertCarry converts the carry flag in dst from the internal format to a 0 or 1.
+// The carry flag is left in an undefined state.
+func (a *Asm) ConvertCarry(which Carry, dst Reg) {
+	if a.Carry().Valid() { // already 0 or 1
+		return
+	}
+	switch which {
+	case AddCarry:
+		if a.Arch.subCarryIsBorrow {
+			a.Neg(dst, dst)
+		} else {
+			a.Add(a.Imm(1), dst, dst, SmashCarry)
+		}
+		a.EOL("convert add carry")
+	case SubCarry:
+		a.Neg(dst, dst)
+		a.EOL("convert sub carry")
+	}
+}
+
+// SaveConvertCarry saves and converts the carry flag into dst: 0 unset, 1 set.
+// The carry flag is left in an undefined state.
+func (a *Asm) SaveConvertCarry(which Carry, dst Reg) {
+	switch which {
+	default:
+		a.Fatalf("bad carry")
+	case AddCarry:
+		if (a.Arch.adc != "" || a.Arch.adcs != "") && a.ZR().Valid() {
+			a.Add(a.ZR(), a.ZR(), dst, UseCarry|SmashCarry)
+			a.EOL("save & convert add carry")
+			return
+		}
+	case SubCarry:
+		// no special cases
+	}
+	a.SaveCarry(dst)
+	a.ConvertCarry(which, dst)
+}
+
+// MulWide emits dstlo = src1 * src2 and dsthi = (src1 * src2) >> WordBits.
+// The carry flag is left in an undefined state.
+// If dstlo or dsthi is the zero Reg, then those outputs are discarded.
+func (a *Asm) MulWide(src1, src2, dstlo, dsthi Reg) {
+	switch {
+	default:
+		a.Fatalf("mulwide not available")
+	case a.Arch.mulWideF != nil:
+		a.Arch.mulWideF(a, src1, src2, dstlo, dsthi)
+	case a.Arch.mul != "" && !dsthi.Valid():
+		a.op3(a.Arch.mul, src1, src2, dstlo)
+	case a.Arch.mulhi != "" && !dstlo.Valid():
+		a.op3(a.Arch.mulhi, src1, src2, dsthi)
+	case a.Arch.mul != "" && a.Arch.mulhi != "" && dstlo != src1 && dstlo != src2:
+		a.op3(a.Arch.mul, src1, src2, dstlo)
+		a.op3(a.Arch.mulhi, src1, src2, dsthi)
+	case a.Arch.mul != "" && a.Arch.mulhi != "" && dsthi != src1 && dsthi != src2:
+		a.op3(a.Arch.mulhi, src1, src2, dsthi)
+		a.op3(a.Arch.mul, src1, src2, dstlo)
+	}
+}
+
+// Jmp jumps to the label.
+func (a *Asm) Jmp(label string) {
+	// Note: Some systems prefer the spelling B or BR, but all accept JMP.
+	a.Printf("\tJMP %s\n", label)
+}
+
+// JmpZero jumps to the label if src is zero.
+// It may modify the carry flag unless a.Arch.CarrySafeLoop is true.
+func (a *Asm) JmpZero(src Reg, label string) {
+	a.Printf("\t"+a.Arch.jmpZero+"\n", src, label)
+}
+
+// JmpNonZero jumps to the label if src is non-zero.
+// It may modify the carry flag unless a.Arch.CarrySafeLoop is true.
+func (a *Asm) JmpNonZero(src Reg, label string) {
+	a.Printf("\t"+a.Arch.jmpNonZero+"\n", src, label)
+}
+
+// Label emits a label with the given name.
+func (a *Asm) Label(name string) {
+	a.Printf("%s:\n", name)
+}
+
+// Ret returns.
+func (a *Asm) Ret() {
+	a.Printf("\tRET\n")
+}
diff --git a/src/math/big/internal/asmgen/func.go b/src/math/big/internal/asmgen/func.go
new file mode 100644
index 0000000000..8a762febce
--- /dev/null
+++ b/src/math/big/internal/asmgen/func.go
@@ -0,0 +1,138 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package asmgen
+
+import (
+	"fmt"
+	"slices"
+	"strings"
+)
+
+// Note: Exported fields and methods are expected to be used
+// by function generators (like the ones in add.go and so on).
+// Unexported fields and methods should not be.
+
+// A Func represents a single assembly function.
+type Func struct {
+	Name    string
+	Asm     *Asm
+	inputs  []string       // names of input slices (not beginning with z)
+	outputs []string       // names of output slices (beginning with z)
+	args    map[string]int // offsets of args, results on stack
+}
+
+// Func starts a new function in the assembly output.
+func (a *Asm) Func(decl string) *Func {
+	d, ok := strings.CutPrefix(decl, "func ")
+	if !ok {
+		a.Fatalf("func decl does not begin with 'func '")
+	}
+	name, d, ok := strings.Cut(d, "(")
+	if !ok {
+		a.Fatalf("func decl does not have func arg list")
+	}
+	f := &Func{
+		Name: name,
+		Asm:  a,
+		args: make(map[string]int),
+	}
+	a.FreeAll()
+
+	// Parse argument names and types. Quick and dirty.
+	// Convert (args) (results) into args, results.
+	d = strings.ReplaceAll(d, ") (", ", ")
+	d = strings.TrimSuffix(d, ")")
+	args := strings.Split(d, ",")
+
+	// Assign implicit types to all arguments (x, y int -> x int, y int).
+	typ := ""
+	for i, arg := range slices.Backward(args) {
+		arg = strings.TrimSpace(arg)
+		if !strings.Contains(arg, " ") {
+			if typ == "" {
+				a.Fatalf("missing argument type")
+			}
+			arg += " " + typ
+		} else {
+			_, typ, _ = strings.Cut(arg, " ")
+		}
+		args[i] = arg
+	}
+
+	// Record mapping from names to offsets.
+	off := 0
+	for _, arg := range args {
+		name, typ, _ := strings.Cut(arg, " ")
+		switch typ {
+		default:
+			a.Fatalf("unknown type %s", typ)
+		case "Word", "uint", "int":
+			f.args[name] = off
+			off += a.Arch.WordBytes
+		case "[]Word":
+			if strings.HasPrefix(name, "z") {
+				f.outputs = append(f.outputs, name)
+			} else {
+				f.inputs = append(f.inputs, name)
+			}
+			f.args[name+"_base"] = off
+			f.args[name+"_len"] = off + a.Arch.WordBytes
+			f.args[name+"_cap"] = off + 2*a.Arch.WordBytes
+			off += 3 * a.Arch.WordBytes
+		}
+	}
+
+	a.Printf("\n")
+	a.Printf("// %s\n", decl)
+	a.Printf("TEXT ·%s(SB), NOSPLIT, $0\n", name)
+	if a.Arch.setup != nil {
+		a.Arch.setup(f)
+	}
+	return f
+}
+
+// Arg allocates a new register, copies the named argument (or result) into it,
+// and returns that register.
+func (f *Func) Arg(name string) Reg {
+	return f.ArgHint(name, HintNone)
+}
+
+// ArgHint is like Arg but uses a register allocation hint.
+func (f *Func) ArgHint(name string, hint Hint) Reg {
+	off, ok := f.args[name]
+	if !ok {
+		f.Asm.Fatalf("unknown argument %s", name)
+	}
+	mem := Reg{fmt.Sprintf("%s+%d(FP)", name, off)}
+	if hint == HintMemOK && f.Asm.Arch.memOK {
+		return mem
+	}
+	r := f.Asm.RegHint(hint)
+	f.Asm.Mov(mem, r)
+	return r
+}
+
+// ArgPtr is like Arg but returns a RegPtr.
+func (f *Func) ArgPtr(name string) RegPtr {
+	return RegPtr(f.Arg(name))
+}
+
+// StoreArg stores src into the named argument (or result).
+func (f *Func) StoreArg(src Reg, name string) { + off, ok := f.args[name] + if !ok { + f.Asm.Fatalf("unknown argument %s", name) + } + a := f.Asm + mem := Reg{fmt.Sprintf("%s+%d(FP)", name, off)} + if src.IsImm() && !a.Arch.memOK { + r := a.Reg() + a.Mov(src, r) + a.Mov(r, mem) + a.Free(r) + return + } + a.Mov(src, mem) +} diff --git a/src/math/big/internal/asmgen/main.go b/src/math/big/internal/asmgen/main.go new file mode 100644 index 0000000000..0214a91b1c --- /dev/null +++ b/src/math/big/internal/asmgen/main.go @@ -0,0 +1,30 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Asmgen generates math/big assembly. +// +// Usage: +// +// cd go/src/math/big +// go test ./internal/asmgen -generate +// +// Or: +// +// go generate math/big +package asmgen + +var arches = []*Arch{ + ArchARM, + ArchMIPS, + ArchMIPS64x, +} + +// generate returns the file name and content of the generated assembly for the given architecture. +func generate(arch *Arch) (file string, data []byte) { + file = "arith_" + arch.Name + ".s" + a := NewAsm(arch) + addOrSubVV(a, "addVV") + addOrSubVV(a, "subVV") + return file, a.out.Bytes() +} diff --git a/src/math/big/internal/asmgen/main_test.go b/src/math/big/internal/asmgen/main_test.go new file mode 100644 index 0000000000..ab203d31b9 --- /dev/null +++ b/src/math/big/internal/asmgen/main_test.go @@ -0,0 +1,38 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package asmgen + +import ( + "bytes" + "flag" + "internal/diff" + "os" + "testing" +) + +var generateFlag = flag.Bool("generate", false, "generate files") + +func Test(t *testing.T) { + t.Skip("assembly not yet installed") + for _, arch := range arches { + t.Run(arch.Name, func(t *testing.T) { + file, data := generate(arch) + old, err := os.ReadFile("../../" + file) + if err == nil && bytes.Equal(old, data) { + return + } + if *generateFlag { + if err := os.WriteFile("../../"+file, data, 0o666); err != nil { + t.Fatal(err) + } + return + } + if err != nil { + t.Fatal(err) + } + t.Fatalf("generated assembly differs:\n%s\n", diff.Diff("../../"+file, old, "regenerated", data)) + }) + } +} diff --git a/src/math/big/internal/asmgen/mips.go b/src/math/big/internal/asmgen/mips.go new file mode 100644 index 0000000000..e7079468a6 --- /dev/null +++ b/src/math/big/internal/asmgen/mips.go @@ -0,0 +1,48 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package asmgen + +var ArchMIPS = &Arch{ + Name: "mipsx", + Build: "mips || mipsle", + WordBits: 32, + WordBytes: 4, + CarrySafeLoop: true, + + regs: []string{ + // R0 is 0 + // R23 is the assembler/linker temporary (which we use too). + // R26 and R27 are our virtual carry flags. + // R28 is SB. + // R29 is SP. + // R30 is g. + // R31 is LR. 
+ "R1", "R2", "R3", "R4", "R5", "R6", "R7", "R8", "R9", + "R10", "R11", "R12", "R13", "R14", "R15", "R16", "R17", "R18", "R19", + "R20", "R21", "R22", "R24", "R25", "R26", "R27", + }, + reg0: "R0", + regTmp: "R23", + regCarry: "R26", + regAltCarry: "R27", + + mov: "MOVW", + add: "ADDU", + sltu: "SGTU", // SGTU args are swapped, so it's really SLTU + sub: "SUBU", + mulWideF: mipsMulWide, + lsh: "SLL", + rsh: "SRL", + and: "AND", + or: "OR", + xor: "XOR", + + jmpZero: "BEQ %s, %s", + jmpNonZero: "BNE %s, %s", +} + +func mipsMulWide(a *Asm, src1, src2, dstlo, dsthi Reg) { + a.Printf("\tMULU %s, %s\n\tMOVW LO, %s\n\tMOVW HI, %s\n", src1, src2, dstlo, dsthi) +} diff --git a/src/math/big/internal/asmgen/mips64.go b/src/math/big/internal/asmgen/mips64.go new file mode 100644 index 0000000000..b70239864a --- /dev/null +++ b/src/math/big/internal/asmgen/mips64.go @@ -0,0 +1,48 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package asmgen + +var ArchMIPS64x = &Arch{ + Name: "mips64x", + Build: "mips64 || mips64le", + WordBits: 64, + WordBytes: 8, + CarrySafeLoop: true, + + regs: []string{ + // R0 is 0 + // R23 is the assembler/linker temporary (which we use too). + // R26 and R27 are our virtual carry flags. + // R28 is SB. + // R29 is SP. + // R30 is g. + // R31 is LR. + "R1", "R2", "R3", "R4", "R5", "R6", "R7", "R8", "R9", + "R10", "R11", "R12", "R13", "R14", "R15", "R16", "R17", "R18", "R19", + "R20", "R21", "R22", "R24", "R25", "R26", "R27", + }, + reg0: "R0", + regTmp: "R23", + regCarry: "R26", + regAltCarry: "R27", + + mov: "MOVV", + add: "ADDVU", + sltu: "SGTU", // SGTU args are swapped, so it's really SLTU + sub: "SUBVU", + mulWideF: mips64MulWide, + lsh: "SLLV", + rsh: "SRLV", + and: "AND", + or: "OR", + xor: "XOR", + + jmpZero: "BEQ %s, %s", + jmpNonZero: "BNE %s, %s", +} + +func mips64MulWide(a *Asm, src1, src2, dstlo, dsthi Reg) { + a.Printf("\tMULVU %s, %s\n\tMOVV LO, %s\n\tMOVV HI, %s\n", src1, src2, dstlo, dsthi) +} diff --git a/src/math/big/internal/asmgen/pipe.go b/src/math/big/internal/asmgen/pipe.go new file mode 100644 index 0000000000..743e15f3f8 --- /dev/null +++ b/src/math/big/internal/asmgen/pipe.go @@ -0,0 +1,569 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package asmgen + +import ( + "fmt" + "math/bits" + "slices" +) + +// Note: Exported fields and methods are expected to be used +// by function generators (like the ones in add.go and so on). +// Unexported fields and methods should not be. + +// A Pipe manages the input and output data pipelines for a function's +// memory operations. +// +// The input is one or more equal-length slices of words, so collectively +// it can be viewed as a matrix, in which each slice is a row and each column +// is a set of corresponding words from the different slices. +// The output can be viewed the same way, although it is often just one row. 
+type Pipe struct {
+	f               *Func   // function being generated
+	label           string  // prefix for loop labels (default "loop")
+	backward        bool    // processing columns in reverse
+	started         bool    // Start has been called
+	loaded          bool    // LoadPtrs has been called
+	inPtr           []RegPtr // input slice pointers
+	hints           []Hint   // for each inPtr, a register hint to use for its data
+	outPtr          []RegPtr // output slice pointers
+	index           Reg      // index register, if in use
+	useIndexCounter bool     // index counter requested
+	indexCounter    int      // index is also counter (386); 0 no, -1 negative counter, +1 positive counter
+	readOff         int      // read offset not yet added to index
+	writeOff        int      // write offset not yet added to index
+	factors         []int    // unrolling factors
+	counts          []Reg    // iterations for each factor
+	needWrite       bool     // need a write call during Loop
+	maxColumns      int      // maximum columns during unrolled loop
+	unrollStart     func()   // emit code at start of unrolled body
+	unrollEnd       func()   // emit code at end of unrolled body
+}
+
+// Pipe creates and returns a new pipe for use in the function f.
+func (f *Func) Pipe() *Pipe {
+	a := f.Asm
+	p := &Pipe{
+		f:          f,
+		label:      "loop",
+		maxColumns: 10000000,
+	}
+	if m := a.Arch.maxColumns; m != 0 {
+		p.maxColumns = m
+	}
+	return p
+}
+
+// SetBackward sets the pipe to process the input and output columns in reverse order.
+// This is needed for left shifts, which might otherwise overwrite data they will read later.
+func (p *Pipe) SetBackward() {
+	if p.loaded {
+		p.f.Asm.Fatalf("SetBackward after Start/LoadPtrs")
+	}
+	p.backward = true
+}
+
+// SetUseIndexCounter sets the pipe to use an index counter if possible,
+// meaning the loop counter is also used as an index for accessing the slice data.
+// This clever trick is slower on modern processors, but it is still necessary on 386.
+// On non-386 systems, SetUseIndexCounter is a no-op.
+func (p *Pipe) SetUseIndexCounter() {
+	if p.f.Asm.Arch.memIndex == nil { // need memIndex (only 386 provides it)
+		return
+	}
+	p.useIndexCounter = true
+}
+
+// SetLabel sets the label prefix for the loops emitted by the pipe.
+// The default prefix is "loop".
+func (p *Pipe) SetLabel(label string) {
+	p.label = label
+}
+
+// SetMaxColumns sets the maximum number of
+// columns processed in a single loop body call.
+func (p *Pipe) SetMaxColumns(m int) {
+	p.maxColumns = m
+}
+
+// SetHint records that the inputs from the named vector
+// should be allocated with the given register hint.
+//
+// If the hint indicates a single register on the target architecture,
+// then SetHint calls SetMaxColumns(1), since the hinted register
+// can only be used for one value at a time.
+func (p *Pipe) SetHint(name string, hint Hint) {
+	if hint == HintMemOK && !p.f.Asm.Arch.memOK {
+		return
+	}
+	i := slices.Index(p.f.inputs, name)
+	if i < 0 {
+		p.f.Asm.Fatalf("unknown input name %s", name)
+	}
+	if p.f.Asm.hint(hint) != "" {
+		p.SetMaxColumns(1)
+	}
+	for len(p.hints) <= i {
+		p.hints = append(p.hints, HintNone)
+	}
+	p.hints[i] = hint
+}
+
+// LoadPtrs loads the slice pointer arguments into registers,
+// assuming that the slice length n has already been loaded
+// into the register n.
+//
+// Start will call LoadPtrs if it has not been called already.
+// LoadPtrs only needs to be called explicitly when code needs
+// to use LoadN before Start, like when the shift.go generators
+// read an initial word before the loop.
+func (p *Pipe) LoadPtrs(n Reg) {
+	a := p.f.Asm
+	if p.loaded {
+		a.Fatalf("pointers already loaded")
+	}
+
+	// Load the actual pointers.
+	p.loaded = true
+	for _, name := range p.f.inputs {
+		p.inPtr = append(p.inPtr, RegPtr(p.f.Arg(name+"_base")))
+	}
+	for _, name := range p.f.outputs {
+		p.outPtr = append(p.outPtr, RegPtr(p.f.Arg(name+"_base")))
+	}
+
+	// Decide the memory access strategy for LoadN and StoreN.
+	switch {
+	case p.backward && p.useIndexCounter:
+		// Generator wants an index counter, meaning when the iteration counter
+		// is AX, we will access the slice with pointer BX using (BX)(AX*WordBytes).
+		// The loop is moving backward through the slice, but the counter
+		// is also moving backward, so not much to do.
+		a.Comment("run loop backward, using counter as positive index")
+		p.indexCounter = +1
+		p.index = n
+
+	case !p.backward && p.useIndexCounter:
+		// Generator wants an index counter, but the loop is moving forward.
+		// To make the counter move in the direction of data access,
+		// we negate the counter, counting up from -len(z) to -1.
+		// To make the index access the right words, we add len(z)*WordBytes
+		// to each of the pointers.
+		// See comment below about the garbage collector (non-)implications
+		// of pointing beyond the slice bounds.
+		a.Comment("use counter as negative index")
+		p.indexCounter = -1
+		p.index = n
+		for _, ptr := range p.inPtr {
+			a.AddWords(n, ptr, ptr)
+		}
+		for _, ptr := range p.outPtr {
+			a.AddWords(n, ptr, ptr)
+		}
+		a.Neg(n, n)
+
+	case p.backward:
+		// Generator wants to run the loop backward.
+		// We'll decrement the pointers before using them,
+		// so position them at the very end of the slices.
+		// If we had precise pointer information for assembly,
+		// these pointers would cause problems with the garbage collector,
+		// since they no longer point into the allocated slice,
+		// but the garbage collector ignores unexpected values in assembly stacks,
+		// and the actual slice pointers are still in the argument stack slots,
+		// so the slices won't be collected early.
+		// If we switched to the register ABI, we might have to rethink this.
+		// (The same thing happens by the end of forward loops,
+		// but it's less important since once the pointers go off the slice
+		// in a forward loop, the loop is over and the slice won't be accessed anymore.)
+		a.Comment("run loop backward")
+		for _, ptr := range p.inPtr {
+			a.AddWords(n, ptr, ptr)
+		}
+		for _, ptr := range p.outPtr {
+			a.AddWords(n, ptr, ptr)
+		}
+
+	case !p.backward:
+		// Nothing to do!
+	}
+}
+
+// LoadN returns the next n columns of input words as a slice of rows.
+// Regs for inputs that have been marked with HintMemOK (using p.SetHint) will be direct memory references.
+// Regs for other inputs will be newly allocated registers and must be freed.
+func (p *Pipe) LoadN(n int) [][]Reg {
+	a := p.f.Asm
+	regs := make([][]Reg, len(p.inPtr))
+	for i, ptr := range p.inPtr {
+		regs[i] = make([]Reg, n)
+		switch {
+		case a.Arch.loadIncN != nil:
+			// Load from memory and advance pointers at the same time.
+			for j := range regs[i] {
+				regs[i][j] = p.f.Asm.Reg()
+			}
+			if p.backward {
+				a.Arch.loadDecN(a, ptr, regs[i])
+			} else {
+				a.Arch.loadIncN(a, ptr, regs[i])
+			}
+
+		default:
+			// Load from memory using offsets.
+			// We'll advance the pointers or the index counter later.
+ for j := range n { + off := p.readOff + j + if p.backward { + off = -(off + 1) + } + var mem Reg + if p.indexCounter != 0 { + mem = a.Arch.memIndex(a, off*a.Arch.WordBytes, p.index, ptr) + } else { + mem = ptr.mem(off * a.Arch.WordBytes) + } + h := HintNone + if i < len(p.hints) { + h = p.hints[i] + } + if h == HintMemOK { + regs[i][j] = mem + } else { + r := p.f.Asm.RegHint(h) + a.Mov(mem, r) + regs[i][j] = r + } + } + } + } + p.readOff += n + return regs +} + +// StoreN writes regs (a slice of rows) to the next n columns of output, where n = len(regs[0]). +func (p *Pipe) StoreN(regs [][]Reg) { + p.needWrite = false + a := p.f.Asm + if len(regs) != len(p.outPtr) { + p.f.Asm.Fatalf("wrong number of output rows") + } + n := len(regs[0]) + for i, ptr := range p.outPtr { + switch { + case a.Arch.storeIncN != nil: + // Store to memory and advance pointers at the same time. + if p.backward { + a.Arch.storeDecN(a, ptr, regs[i]) + } else { + a.Arch.storeIncN(a, ptr, regs[i]) + } + + default: + // Store to memory using offsets. + // We'll advance the pointers or the index counter later. + for j, r := range regs[i] { + off := p.writeOff + j + if p.backward { + off = -(off + 1) + } + var mem Reg + if p.indexCounter != 0 { + mem = a.Arch.memIndex(a, off*a.Arch.WordBytes, p.index, ptr) + } else { + mem = ptr.mem(off * a.Arch.WordBytes) + } + a.Mov(r, mem) + } + } + } + p.writeOff += n +} + +// advancePtrs advances the pointers by step +// or handles bookkeeping for an imminent index advance by step +// that the caller will do. +func (p *Pipe) advancePtrs(step int) { + a := p.f.Asm + switch { + case a.Arch.loadIncN != nil: + // nothing to do + + default: + // Adjust read/write offsets for pointer advance (or imminent index advance). + p.readOff -= step + p.writeOff -= step + + if p.indexCounter == 0 { + // Advance pointers. + if p.backward { + step = -step + } + for _, ptr := range p.inPtr { + a.Add(a.Imm(step*a.Arch.WordBytes), Reg(ptr), Reg(ptr), KeepCarry) + } + for _, ptr := range p.outPtr { + a.Add(a.Imm(step*a.Arch.WordBytes), Reg(ptr), Reg(ptr), KeepCarry) + } + } + } +} + +// DropInput deletes the named input from the pipe, +// usually because it has been exhausted. +// (This is not used yet but will be used in a future generator.) +func (p *Pipe) DropInput(name string) { + i := slices.Index(p.f.inputs, name) + if i < 0 { + p.f.Asm.Fatalf("unknown input %s", name) + } + ptr := p.inPtr[i] + p.f.Asm.Free(Reg(ptr)) + p.inPtr = slices.Delete(p.inPtr, i, i+1) + p.f.inputs = slices.Delete(p.f.inputs, i, i+1) + if len(p.hints) > i { + p.hints = slices.Delete(p.hints, i, i+1) + } +} + +// Start prepares to loop over n columns. +// The factors give a sequence of unrolling factors to use, +// which must be either strictly increasing or strictly decreasing +// and must include 1. +// For example, 4, 1 means to process 4 elements at a time +// and then 1 at a time for the final 0-3; specifying 1,4 instead +// handles 0-3 elements first and then 4 at a time. +// Similarly, 32, 4, 1 means to process 32 at a time, +// then 4 at a time, then 1 at a time. +// +// One benefit of using 1, 4 instead of 4, 1 is that the body +// processing 4 at a time needs more registers, and if it is +// the final body, the register holding the fragment count (0-3) +// has been freed and is available for use. +// +// Start may modify the carry flag. +// +// Start must be followed by a call to Loop1 or LoopN, +// but it is permitted to emit other instructions first, +// for example to set an initial carry flag. 
+func (p *Pipe) Start(n Reg, factors ...int) {
+	a := p.f.Asm
+	if p.started {
+		a.Fatalf("loop already started")
+	}
+	if p.useIndexCounter && len(factors) > 1 {
+		a.Fatalf("cannot call SetUseIndexCounter and then use Start with factors != [1]; have factors = %v", factors)
+	}
+	p.started = true
+	if !p.loaded {
+		if len(factors) == 1 {
+			p.SetUseIndexCounter()
+		}
+		p.LoadPtrs(n)
+	}
+
+	// If there were calls to LoadN between LoadPtrs and Start,
+	// adjust the loop not to scan those columns, assuming that
+	// either the code already called an equivalent StoreN or else
+	// that it will do so after the loop.
+	if off := p.readOff; off != 0 {
+		if p.indexCounter < 0 {
+			// Index is negated, so add off instead of subtracting.
+			a.Add(a.Imm(off), n, n, SmashCarry)
+		} else {
+			a.Sub(a.Imm(off), n, n, SmashCarry)
+		}
+		if p.indexCounter != 0 {
+			// n is also the index we are using, so adjust readOff and writeOff
+			// to continue to point at the same positions as before we changed n.
+			p.readOff -= off
+			p.writeOff -= off
+		}
+	}
+
+	p.Restart(n, factors...)
+}
+
+// Restart prepares to loop over an additional n columns,
+// beyond a previous loop run by p.Start/p.Loop.
+func (p *Pipe) Restart(n Reg, factors ...int) {
+	a := p.f.Asm
+	if !p.started {
+		a.Fatalf("pipe not started")
+	}
+	p.factors = factors
+	p.counts = make([]Reg, len(factors))
+	if len(factors) == 0 {
+		factors = []int{1}
+	}
+
+	// Compute the loop lengths for each unrolled section into separate registers.
+	// We compute them all ahead of time in case the computation would smash
+	// a carry flag that the loop bodies need preserved.
+	if len(factors) > 1 {
+		a.Comment("compute unrolled loop lengths")
+	}
+	switch {
+	default:
+		a.Fatalf("invalid factors %v", factors)
+
+	case factors[0] == 1:
+		// increasing loop factors
+		div := 1
+		for i, f := range factors[1:] {
+			if f <= factors[i] {
+				a.Fatalf("non-increasing factors %v", factors)
+			}
+			if f&(f-1) != 0 {
+				a.Fatalf("non-power-of-two factors %v", factors)
+			}
+			t := p.f.Asm.Reg()
+			f /= div
+			a.And(a.Imm(f-1), n, t)
+			a.Rsh(a.Imm(bits.TrailingZeros(uint(f))), n, n)
+			div *= f
+			p.counts[i] = t
+		}
+		p.counts[len(p.counts)-1] = n
+
+	case factors[len(factors)-1] == 1:
+		// decreasing loop factors
+		for i, f := range factors[:len(factors)-1] {
+			if f <= factors[i+1] {
+				a.Fatalf("non-decreasing factors %v", factors)
+			}
+			if f&(f-1) != 0 {
+				a.Fatalf("non-power-of-two factors %v", factors)
+			}
+			t := p.f.Asm.Reg()
+			a.Rsh(a.Imm(bits.TrailingZeros(uint(f))), n, t)
+			a.And(a.Imm(f-1), n, n)
+			p.counts[i] = t
+		}
+		p.counts[len(p.counts)-1] = n
+	}
+}
+
+// Done frees all the registers allocated by the pipe.
+func (p *Pipe) Done() {
+	for _, ptr := range p.inPtr {
+		p.f.Asm.Free(Reg(ptr))
+	}
+	p.inPtr = nil
+	for _, ptr := range p.outPtr {
+		p.f.Asm.Free(Reg(ptr))
+	}
+	p.outPtr = nil
+	p.index = Reg{}
+}
+
+// Loop emits code for the loop, calling block repeatedly to emit code that
+// handles a block of N input columns (for arbitrary N = len(in[0]) chosen by p).
+// block must call p.StoreN(out) to write N output columns.
+// The out slice is a pre-allocated matrix of uninitialized Reg values.
+// block is expected to set each entry to the Reg that should be written
+// before calling p.StoreN(out).
+//
+// For example, if the loop is to be unrolled 4x in blocks of 2 columns each,
+// the sequence of calls to emit the unrolled loop body is:
+//
+//	start() // set by p.AtUnrollStart
+//	... reads for 2 columns ...
+//	block()
+//	... writes for 2 columns ...
+//	... reads for 2 columns ...
+//	block()
+//	... writes for 2 columns ...
+//	end() // set by p.AtUnrollEnd
+//
+// Any registers allocated during block are freed automatically when block returns.
+func (p *Pipe) Loop(block func(in, out [][]Reg)) {
+	if p.factors == nil {
+		p.f.Asm.Fatalf("Pipe.Start not called")
+	}
+	for i, factor := range p.factors {
+		n := p.counts[i]
+		p.unroll(n, factor, block)
+		if i < len(p.factors)-1 {
+			p.f.Asm.Free(n)
+		}
+	}
+	p.factors = nil
+}
+
+// AtUnrollStart sets a function to call at the start of an unrolled sequence.
+// See [Pipe.Loop] for details.
+func (p *Pipe) AtUnrollStart(start func()) {
+	p.unrollStart = start
+}
+
+// AtUnrollEnd sets a function to call at the end of an unrolled sequence.
+// See [Pipe.Loop] for details.
+func (p *Pipe) AtUnrollEnd(end func()) {
+	p.unrollEnd = end
+}
+
+// unroll emits a single unrolled loop for the given factor, iterating n times.
+func (p *Pipe) unroll(n Reg, factor int, block func(in, out [][]Reg)) {
+	a := p.f.Asm
+	label := fmt.Sprintf("%s%d", p.label, factor)
+
+	// Top of loop control flow.
+	a.Label(label)
+	if a.Arch.loopTop != "" {
+		a.Printf("\t"+a.Arch.loopTop+"\n", n, label+"done")
+	} else {
+		a.JmpZero(n, label+"done")
+	}
+	a.Label(label + "cont")
+
+	// Unrolled loop body.
+	if factor <= p.maxColumns {
+		a.Comment("unroll %dX", factor)
+	} else {
+		a.Comment("unroll %dX in batches of %d", factor, p.maxColumns)
+	}
+	if p.unrollStart != nil {
+		p.unrollStart()
+	}
+	for done := 0; done < factor; {
+		batch := min(factor-done, p.maxColumns)
+		regs := a.RegsUsed()
+		out := make([][]Reg, len(p.outPtr))
+		for i := range out {
+			out[i] = make([]Reg, batch)
+		}
+		in := p.LoadN(batch)
+		p.needWrite = true
+		block(in, out)
+		if p.needWrite && len(p.outPtr) > 0 {
+			a.Fatalf("missing p.StoreN")
+		}
+		a.SetRegsUsed(regs) // free anything block allocated
+		done += batch
+	}
+	if p.unrollEnd != nil {
+		p.unrollEnd()
+	}
+	p.advancePtrs(factor)
+
+	// Bottom of loop control flow.
+	switch {
+	case p.indexCounter >= 0 && a.Arch.loopBottom != "":
+		a.Printf("\t"+a.Arch.loopBottom+"\n", n, label+"cont")
+
+	case p.indexCounter >= 0:
+		a.Sub(a.Imm(1), n, n, KeepCarry)
+		a.JmpNonZero(n, label+"cont")
+
+	case p.indexCounter < 0 && a.Arch.loopBottomNeg != "":
+		a.Printf("\t"+a.Arch.loopBottomNeg+"\n", n, label+"cont")
+
+	case p.indexCounter < 0:
+		a.Add(a.Imm(1), n, n, KeepCarry)
+		a.JmpNonZero(n, label+"cont")
+	}
+	a.Label(label + "done")
+}
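
Editor's note (not part of the CL): the pieces above compose in a fixed shape: a.Func declares the Go signature and emits the TEXT line, f.Pipe sets up the load/store pipeline, and Pipe.Loop emits the unrolled body. The sketch below illustrates that shape with a hypothetical copyVV generator; the name and the function are illustrative only and do not appear in this change, but every call it makes is defined in the files above, and it is modeled directly on addOrSubVV in add.go.

	// Hypothetical generator sketch, assuming only the API in this CL.
	// copyVV would emit assembly for copyVV(z, x []Word), copying x into z.
	func copyVV(a *Asm, name string) {
		f := a.Func("func " + name + "(z, x []Word)")
		n := f.Arg("z_len") // number of words to process
		p := f.Pipe()
		p.Start(n, 1, 4) // handle 0-3 words first, then 4 at a time
		p.Loop(func(in, out [][]Reg) {
			// in[0] holds the registers loaded from x; reuse them as the
			// z outputs, the same trick addOrSubVV plays via p.StoreN(in[:1]).
			for i, r := range in[0] {
				out[0][i] = r
			}
			p.StoreN(out)
		})
		p.Done()
		a.Ret()
	}

As with addOrSubVV, registers allocated inside the block are freed automatically after each unrolled batch (via RegsUsed/SetRegsUsed), so the body only has to route values from inputs to outputs.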