cmd/compile: implement compiler for riscv64
Based on riscv-go port.

Updates #27532
Change-Id: Ia329daa243db63ff334053b8807ea96b97ce3acf
Reviewed-on: https://go-review.googlesource.com/c/go/+/204631
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>

This commit is contained in:
parent 91d75f4e4c
commit 98d2717499
@@ -175,7 +175,7 @@ func TestIntendedInlining(t *testing.T) {
 	}
 
 	switch runtime.GOARCH {
-	case "386", "wasm", "arm":
+	case "386", "wasm", "arm", "riscv64":
 	default:
 		// TODO(mvdan): As explained in /test/inline_sync.go, some
 		// architectures don't have atomic intrinsics, so these go over
@@ -705,6 +705,12 @@ func (lv *Liveness) markUnsafePoints() {
 				v = v.Args[0]
 				continue
 			}
+		case ssa.OpRISCV64SUB:
+			// RISCV64 lowers Neq32 to include a SUB with multiple arguments.
+			// TODO(jsing): it would be preferable not to use Neq32 for
+			// writeBuffer.enabled checks on this platform.
+			v = v.Args[0]
+			continue
 		case ssa.Op386MOVLload, ssa.OpARM64MOVWUload, ssa.OpPPC64MOVWZload, ssa.OpWasmI64Load32U:
 			// Args[0] is the address of the write
 			// barrier control. Ignore Args[1],
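The SUB shows up in this chain because of how the new backend lowers inequality tests; the relevant rules from the RISCV64.rules file added by this change are:

(NeqPtr x y) -> (SNEZ (SUB <x.Type> x y))
(Neq32 x y) -> (SNEZ (ZeroExt32to64 (SUB <x.Type> x y)))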
@@ -6533,7 +6533,7 @@ func (s *SSAGenState) Call(v *ssa.Value) *obj.Prog {
 	} else {
 		// TODO(mdempsky): Can these differences be eliminated?
 		switch thearch.LinkArch.Family {
-		case sys.AMD64, sys.I386, sys.PPC64, sys.S390X, sys.Wasm:
+		case sys.AMD64, sys.I386, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm:
 			p.To.Type = obj.TYPE_REG
 		case sys.ARM, sys.ARM64, sys.MIPS, sys.MIPS64:
 			p.To.Type = obj.TYPE_MEM
src/cmd/compile/internal/riscv64/galign.go (new file, 25 lines)
@@ -0,0 +1,25 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package riscv64

import (
	"cmd/compile/internal/gc"
	"cmd/internal/obj/riscv"
)

func Init(arch *gc.Arch) {
	arch.LinkArch = &riscv.LinkRISCV64

	arch.REGSP = riscv.REG_SP
	arch.MAXWIDTH = 1 << 50

	arch.Ginsnop = ginsnop
	arch.Ginsnopdefer = ginsnop
	arch.ZeroRange = zeroRange

	arch.SSAMarkMoves = ssaMarkMoves
	arch.SSAGenValue = ssaGenValue
	arch.SSAGenBlock = ssaGenBlock
}
src/cmd/compile/internal/riscv64/ggen.go (new file, 48 lines)
@@ -0,0 +1,48 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package riscv64

import (
	"cmd/compile/internal/gc"
	"cmd/internal/obj"
	"cmd/internal/obj/riscv"
)

func zeroRange(pp *gc.Progs, p *obj.Prog, off, cnt int64, _ *uint32) *obj.Prog {
	if cnt == 0 {
		return p
	}

	// Adjust the frame to account for LR.
	off += gc.Ctxt.FixedFrameSize()

	if cnt < int64(4*gc.Widthptr) {
		for i := int64(0); i < cnt; i += int64(gc.Widthptr) {
			p = pp.Appendpp(p, riscv.AMOV, obj.TYPE_REG, riscv.REG_ZERO, 0, obj.TYPE_MEM, riscv.REG_SP, off+i)
		}
		return p
	}

	// TODO(jsing): Add a duff zero implementation for medium sized ranges.

	// Loop, zeroing pointer width bytes at a time.
	// ADD	$(off), SP, T0
	// ADD	$(cnt), T0, T1
	// loop:
	//	MOV	ZERO, (T0)
	//	ADD	$Widthptr, T0
	//	BNE	T0, T1, loop
	p = pp.Appendpp(p, riscv.AADD, obj.TYPE_CONST, 0, off, obj.TYPE_REG, riscv.REG_T0, 0)
	p.Reg = riscv.REG_SP
	p = pp.Appendpp(p, riscv.AADD, obj.TYPE_CONST, 0, cnt, obj.TYPE_REG, riscv.REG_T1, 0)
	p.Reg = riscv.REG_T0
	p = pp.Appendpp(p, riscv.AMOV, obj.TYPE_REG, riscv.REG_ZERO, 0, obj.TYPE_MEM, riscv.REG_T0, 0)
	loop := p
	p = pp.Appendpp(p, riscv.AADD, obj.TYPE_CONST, 0, int64(gc.Widthptr), obj.TYPE_REG, riscv.REG_T0, 0)
	p = pp.Appendpp(p, riscv.ABNE, obj.TYPE_REG, riscv.REG_T0, 0, obj.TYPE_BRANCH, 0, 0)
	p.Reg = riscv.REG_T1
	gc.Patch(p, loop)
	return p
}
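As a rough illustration of what the large-range loop above computes (the helper name and the word-sized slice view are illustrative, not part of the change), the emitted sequence behaves like this Go sketch, assuming off and cnt are multiples of the 8-byte pointer width:

// Zero cnt bytes starting at byte offset off, one pointer-width word at a time.
func zeroWords(words []uint64, off, cnt int64) {
	t0 := off           // ADD $(off), SP, T0
	t1 := off + cnt     // ADD $(cnt), T0, T1
	for {
		words[t0/8] = 0 // MOV ZERO, (T0)
		t0 += 8         // ADD $Widthptr, T0
		if t0 == t1 {   // BNE T0, T1, loop
			break
		}
	}
}

The small-range path simply emits one zero store per word instead of a loop.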
src/cmd/compile/internal/riscv64/gsubr.go (new file, 20 lines)
@@ -0,0 +1,20 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package riscv64

import (
	"cmd/compile/internal/gc"
	"cmd/internal/obj"
	"cmd/internal/obj/riscv"
)

func ginsnop(pp *gc.Progs) *obj.Prog {
	// Hardware nop is ADD $0, ZERO
	p := pp.Prog(riscv.AADD)
	p.From.Type = obj.TYPE_CONST
	p.Reg = riscv.REG_ZERO
	p.To = obj.Addr{Type: obj.TYPE_REG, Reg: riscv.REG_ZERO}
	return p
}
src/cmd/compile/internal/riscv64/ssa.go (new file, 496 lines)
@@ -0,0 +1,496 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package riscv64

import (
	"cmd/compile/internal/gc"
	"cmd/compile/internal/ssa"
	"cmd/compile/internal/types"
	"cmd/internal/obj"
	"cmd/internal/obj/riscv"
)

// ssaRegToReg maps ssa register numbers to obj register numbers.
var ssaRegToReg = []int16{
	riscv.REG_X0,
	// X1 (LR): unused
	riscv.REG_X2,
	riscv.REG_X3,
	riscv.REG_X4,
	riscv.REG_X5,
	riscv.REG_X6,
	riscv.REG_X7,
	riscv.REG_X8,
	riscv.REG_X9,
	riscv.REG_X10,
	riscv.REG_X11,
	riscv.REG_X12,
	riscv.REG_X13,
	riscv.REG_X14,
	riscv.REG_X15,
	riscv.REG_X16,
	riscv.REG_X17,
	riscv.REG_X18,
	riscv.REG_X19,
	riscv.REG_X20,
	riscv.REG_X21,
	riscv.REG_X22,
	riscv.REG_X23,
	riscv.REG_X24,
	riscv.REG_X25,
	riscv.REG_X26,
	riscv.REG_X27,
	riscv.REG_X28,
	riscv.REG_X29,
	riscv.REG_X30,
	riscv.REG_X31,
	riscv.REG_F0,
	riscv.REG_F1,
	riscv.REG_F2,
	riscv.REG_F3,
	riscv.REG_F4,
	riscv.REG_F5,
	riscv.REG_F6,
	riscv.REG_F7,
	riscv.REG_F8,
	riscv.REG_F9,
	riscv.REG_F10,
	riscv.REG_F11,
	riscv.REG_F12,
	riscv.REG_F13,
	riscv.REG_F14,
	riscv.REG_F15,
	riscv.REG_F16,
	riscv.REG_F17,
	riscv.REG_F18,
	riscv.REG_F19,
	riscv.REG_F20,
	riscv.REG_F21,
	riscv.REG_F22,
	riscv.REG_F23,
	riscv.REG_F24,
	riscv.REG_F25,
	riscv.REG_F26,
	riscv.REG_F27,
	riscv.REG_F28,
	riscv.REG_F29,
	riscv.REG_F30,
	riscv.REG_F31,
	0, // SB isn't a real register. We fill an Addr.Reg field with 0 in this case.
}

func loadByType(t *types.Type) obj.As {
	width := t.Size()

	if t.IsFloat() {
		switch width {
		case 4:
			return riscv.AMOVF
		case 8:
			return riscv.AMOVD
		default:
			gc.Fatalf("unknown float width for load %d in type %v", width, t)
			return 0
		}
	}

	switch width {
	case 1:
		if t.IsSigned() {
			return riscv.AMOVB
		} else {
			return riscv.AMOVBU
		}
	case 2:
		if t.IsSigned() {
			return riscv.AMOVH
		} else {
			return riscv.AMOVHU
		}
	case 4:
		if t.IsSigned() {
			return riscv.AMOVW
		} else {
			return riscv.AMOVWU
		}
	case 8:
		return riscv.AMOV
	default:
		gc.Fatalf("unknown width for load %d in type %v", width, t)
		return 0
	}
}

// storeByType returns the store instruction of the given type.
func storeByType(t *types.Type) obj.As {
	width := t.Size()

	if t.IsFloat() {
		switch width {
		case 4:
			return riscv.AMOVF
		case 8:
			return riscv.AMOVD
		default:
			gc.Fatalf("unknown float width for store %d in type %v", width, t)
			return 0
		}
	}

	switch width {
	case 1:
		return riscv.AMOVB
	case 2:
		return riscv.AMOVH
	case 4:
		return riscv.AMOVW
	case 8:
		return riscv.AMOV
	default:
		gc.Fatalf("unknown width for store %d in type %v", width, t)
		return 0
	}
}

// largestMove returns the largest move instruction possible and its size,
// given the alignment of the total size of the move.
//
// e.g., a 16-byte move may use MOV, but an 11-byte move must use MOVB.
//
// Note that the moves may not be on naturally aligned addresses depending on
// the source and destination.
//
// This matches the calculation in ssa.moveSize.
func largestMove(alignment int64) (obj.As, int64) {
	switch {
	case alignment%8 == 0:
		return riscv.AMOV, 8
	case alignment%4 == 0:
		return riscv.AMOVW, 4
	case alignment%2 == 0:
		return riscv.AMOVH, 2
	default:
		return riscv.AMOVB, 1
	}
}

// markMoves marks any MOVXconst ops that need to avoid clobbering flags.
// RISC-V has no flags, so this is a no-op.
func ssaMarkMoves(s *gc.SSAGenState, b *ssa.Block) {}

func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
	s.SetPos(v.Pos)

	switch v.Op {
	case ssa.OpInitMem:
		// memory arg needs no code
	case ssa.OpArg:
		// input args need no code
	case ssa.OpPhi:
		gc.CheckLoweredPhi(v)
	case ssa.OpCopy, ssa.OpRISCV64MOVconvert:
		if v.Type.IsMemory() {
			return
		}
		rs := v.Args[0].Reg()
		rd := v.Reg()
		if rs == rd {
			return
		}
		as := riscv.AMOV
		if v.Type.IsFloat() {
			as = riscv.AMOVD
		}
		p := s.Prog(as)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = rs
		p.To.Type = obj.TYPE_REG
		p.To.Reg = rd
	case ssa.OpLoadReg:
		if v.Type.IsFlags() {
			v.Fatalf("load flags not implemented: %v", v.LongString())
			return
		}
		p := s.Prog(loadByType(v.Type))
		gc.AddrAuto(&p.From, v.Args[0])
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()
	case ssa.OpStoreReg:
		if v.Type.IsFlags() {
			v.Fatalf("store flags not implemented: %v", v.LongString())
			return
		}
		p := s.Prog(storeByType(v.Type))
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		gc.AddrAuto(&p.To, v)
	case ssa.OpSP, ssa.OpSB, ssa.OpGetG:
		// nothing to do
	case ssa.OpRISCV64ADD, ssa.OpRISCV64SUB, ssa.OpRISCV64XOR, ssa.OpRISCV64OR, ssa.OpRISCV64AND,
		ssa.OpRISCV64SLL, ssa.OpRISCV64SRA, ssa.OpRISCV64SRL,
		ssa.OpRISCV64SLT, ssa.OpRISCV64SLTU, ssa.OpRISCV64MUL, ssa.OpRISCV64MULW, ssa.OpRISCV64MULH,
		ssa.OpRISCV64MULHU, ssa.OpRISCV64DIV, ssa.OpRISCV64DIVU, ssa.OpRISCV64DIVW,
		ssa.OpRISCV64DIVUW, ssa.OpRISCV64REM, ssa.OpRISCV64REMU, ssa.OpRISCV64REMW,
		ssa.OpRISCV64REMUW,
		ssa.OpRISCV64FADDS, ssa.OpRISCV64FSUBS, ssa.OpRISCV64FMULS, ssa.OpRISCV64FDIVS,
		ssa.OpRISCV64FEQS, ssa.OpRISCV64FNES, ssa.OpRISCV64FLTS, ssa.OpRISCV64FLES,
		ssa.OpRISCV64FADDD, ssa.OpRISCV64FSUBD, ssa.OpRISCV64FMULD, ssa.OpRISCV64FDIVD,
		ssa.OpRISCV64FEQD, ssa.OpRISCV64FNED, ssa.OpRISCV64FLTD, ssa.OpRISCV64FLED:
		r := v.Reg()
		r1 := v.Args[0].Reg()
		r2 := v.Args[1].Reg()
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r2
		p.Reg = r1
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r
	case ssa.OpRISCV64FSQRTS, ssa.OpRISCV64FNEGS, ssa.OpRISCV64FSQRTD, ssa.OpRISCV64FNEGD,
		ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVDX,
		ssa.OpRISCV64FCVTSW, ssa.OpRISCV64FCVTSL, ssa.OpRISCV64FCVTWS, ssa.OpRISCV64FCVTLS,
		ssa.OpRISCV64FCVTDW, ssa.OpRISCV64FCVTDL, ssa.OpRISCV64FCVTWD, ssa.OpRISCV64FCVTLD, ssa.OpRISCV64FCVTDS, ssa.OpRISCV64FCVTSD:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()
	case ssa.OpRISCV64ADDI, ssa.OpRISCV64XORI, ssa.OpRISCV64ORI, ssa.OpRISCV64ANDI,
		ssa.OpRISCV64SLLI, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRLI, ssa.OpRISCV64SLTI,
		ssa.OpRISCV64SLTIU:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		p.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()
	case ssa.OpRISCV64MOVBconst, ssa.OpRISCV64MOVHconst, ssa.OpRISCV64MOVWconst, ssa.OpRISCV64MOVDconst:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()
	case ssa.OpRISCV64MOVaddr:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_ADDR
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

		var wantreg string
		// MOVW $sym+off(base), R
		switch v.Aux.(type) {
		default:
			v.Fatalf("aux is of unknown type %T", v.Aux)
		case *obj.LSym:
			wantreg = "SB"
			gc.AddAux(&p.From, v)
		case *gc.Node:
			wantreg = "SP"
			gc.AddAux(&p.From, v)
		case nil:
			// No sym, just MOVW $off(SP), R
			wantreg = "SP"
			p.From.Reg = riscv.REG_SP
			p.From.Offset = v.AuxInt
		}
		if reg := v.Args[0].RegName(); reg != wantreg {
			v.Fatalf("bad reg %s for symbol type %T, want %s", reg, v.Aux, wantreg)
		}
	case ssa.OpRISCV64MOVBload, ssa.OpRISCV64MOVHload, ssa.OpRISCV64MOVWload, ssa.OpRISCV64MOVDload,
		ssa.OpRISCV64MOVBUload, ssa.OpRISCV64MOVHUload, ssa.OpRISCV64MOVWUload,
		ssa.OpRISCV64FMOVWload, ssa.OpRISCV64FMOVDload:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
		gc.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()
	case ssa.OpRISCV64MOVBstore, ssa.OpRISCV64MOVHstore, ssa.OpRISCV64MOVWstore, ssa.OpRISCV64MOVDstore,
		ssa.OpRISCV64FMOVWstore, ssa.OpRISCV64FMOVDstore:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[1].Reg()
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[0].Reg()
		gc.AddAux(&p.To, v)
	case ssa.OpRISCV64SEQZ, ssa.OpRISCV64SNEZ:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()
	case ssa.OpRISCV64CALLstatic, ssa.OpRISCV64CALLclosure, ssa.OpRISCV64CALLinter:
		s.Call(v)
	case ssa.OpRISCV64LoweredWB:
		p := s.Prog(obj.ACALL)
		p.To.Type = obj.TYPE_MEM
		p.To.Name = obj.NAME_EXTERN
		p.To.Sym = v.Aux.(*obj.LSym)
	case ssa.OpRISCV64LoweredPanicBoundsA, ssa.OpRISCV64LoweredPanicBoundsB, ssa.OpRISCV64LoweredPanicBoundsC:
		p := s.Prog(obj.ACALL)
		p.To.Type = obj.TYPE_MEM
		p.To.Name = obj.NAME_EXTERN
		p.To.Sym = gc.BoundsCheckFunc[v.AuxInt]
		s.UseArgs(16) // space used in callee args area by assembly stubs
	case ssa.OpRISCV64LoweredZero:
		mov, sz := largestMove(v.AuxInt)

		// mov	ZERO, (Rarg0)
		// ADD	$sz, Rarg0
		// BGEU	Rarg1, Rarg0, -2(PC)

		p := s.Prog(mov)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = riscv.REG_ZERO
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[0].Reg()

		p2 := s.Prog(riscv.AADD)
		p2.From.Type = obj.TYPE_CONST
		p2.From.Offset = sz
		p2.To.Type = obj.TYPE_REG
		p2.To.Reg = v.Args[0].Reg()

		p3 := s.Prog(riscv.ABGEU)
		p3.To.Type = obj.TYPE_BRANCH
		p3.Reg = v.Args[0].Reg()
		p3.From.Type = obj.TYPE_REG
		p3.From.Reg = v.Args[1].Reg()
		gc.Patch(p3, p)

	case ssa.OpRISCV64LoweredMove:
		mov, sz := largestMove(v.AuxInt)

		// mov	(Rarg1), T2
		// mov	T2, (Rarg0)
		// ADD	$sz, Rarg0
		// ADD	$sz, Rarg1
		// BGEU	Rarg2, Rarg0, -4(PC)

		p := s.Prog(mov)
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[1].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = riscv.REG_T2

		p2 := s.Prog(mov)
		p2.From.Type = obj.TYPE_REG
		p2.From.Reg = riscv.REG_T2
		p2.To.Type = obj.TYPE_MEM
		p2.To.Reg = v.Args[0].Reg()

		p3 := s.Prog(riscv.AADD)
		p3.From.Type = obj.TYPE_CONST
		p3.From.Offset = sz
		p3.To.Type = obj.TYPE_REG
		p3.To.Reg = v.Args[0].Reg()

		p4 := s.Prog(riscv.AADD)
		p4.From.Type = obj.TYPE_CONST
		p4.From.Offset = sz
		p4.To.Type = obj.TYPE_REG
		p4.To.Reg = v.Args[1].Reg()

		p5 := s.Prog(riscv.ABGEU)
		p5.To.Type = obj.TYPE_BRANCH
		p5.Reg = v.Args[1].Reg()
		p5.From.Type = obj.TYPE_REG
		p5.From.Reg = v.Args[2].Reg()
		gc.Patch(p5, p)

	case ssa.OpRISCV64LoweredNilCheck:
		// Issue a load which will fault if arg is nil.
		// TODO: optimizations. See arm and amd64 LoweredNilCheck.
		p := s.Prog(riscv.AMOVB)
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
		gc.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = riscv.REG_ZERO
		if gc.Debug_checknil != 0 && v.Pos.Line() > 1 { // v.Pos == 1 in generated wrappers
			gc.Warnl(v.Pos, "generated nil check")
		}

	case ssa.OpRISCV64LoweredGetClosurePtr:
		// Closure pointer is S4 (riscv.REG_CTXT).
		gc.CheckLoweredGetClosurePtr(v)

	case ssa.OpRISCV64LoweredGetCallerSP:
		// caller's SP is FixedFrameSize below the address of the first arg
		p := s.Prog(riscv.AMOV)
		p.From.Type = obj.TYPE_ADDR
		p.From.Offset = -gc.Ctxt.FixedFrameSize()
		p.From.Name = obj.NAME_PARAM
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpRISCV64LoweredGetCallerPC:
		p := s.Prog(obj.AGETCALLERPC)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	default:
		v.Fatalf("Unhandled op %v", v.Op)
	}
}

func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
	s.SetPos(b.Pos)

	switch b.Kind {
	case ssa.BlockDefer:
		// defer returns in A0:
		// 0 if we should continue executing
		// 1 if we should jump to deferreturn call
		p := s.Prog(riscv.ABNE)
		p.To.Type = obj.TYPE_BRANCH
		p.From.Type = obj.TYPE_REG
		p.From.Reg = riscv.REG_ZERO
		p.Reg = riscv.REG_A0
		s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
		if b.Succs[0].Block() != next {
			p := s.Prog(obj.AJMP)
			p.To.Type = obj.TYPE_BRANCH
			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
		}
	case ssa.BlockPlain:
		if b.Succs[0].Block() != next {
			p := s.Prog(obj.AJMP)
			p.To.Type = obj.TYPE_BRANCH
			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
		}
	case ssa.BlockExit:
	case ssa.BlockRet:
		s.Prog(obj.ARET)
	case ssa.BlockRetJmp:
		p := s.Prog(obj.AJMP)
		p.To.Type = obj.TYPE_MEM
		p.To.Name = obj.NAME_EXTERN
		p.To.Sym = b.Aux.(*obj.LSym)
	case ssa.BlockRISCV64BNE:
		var p *obj.Prog
		switch next {
		case b.Succs[0].Block():
			p = s.Br(riscv.ABNE, b.Succs[1].Block())
			p.As = riscv.InvertBranch(p.As)
		case b.Succs[1].Block():
			p = s.Br(riscv.ABNE, b.Succs[0].Block())
		default:
			if b.Likely != ssa.BranchUnlikely {
				p = s.Br(riscv.ABNE, b.Succs[0].Block())
				s.Br(obj.AJMP, b.Succs[1].Block())
			} else {
				p = s.Br(riscv.ABNE, b.Succs[1].Block())
				p.As = riscv.InvertBranch(p.As)
				s.Br(obj.AJMP, b.Succs[0].Block())
			}
		}
		p.Reg = b.Controls[0].Reg()
		p.From.Type = obj.TYPE_REG
		p.From.Reg = riscv.REG_ZERO

	default:
		b.Fatalf("Unhandled block: %s", b.LongString())
	}
}
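The LoweredZero and LoweredMove cases above each emit a short post-tested loop. A rough Go rendering of the zeroing loop (helper and parameter names are illustrative only; the real output is exactly the mov/ADD/BGEU sequence shown in the comments):

// Rarg0 advances by one element (sz bytes) per iteration; Rarg1 holds the
// address of the last element, as set up by the Zero rewrite rule.
func loweredZero(zeroElem func(addr uintptr), arg0, arg1, sz uintptr) {
	for {
		zeroElem(arg0)   // mov ZERO, (Rarg0)
		arg0 += sz       // ADD $sz, Rarg0
		if arg0 > arg1 { // BGEU: loop while Rarg0 has not passed Rarg1
			break
		}
	}
}

LoweredMove has the same shape, with a load into T2, a store from T2, and a second pointer that advances in step.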
@@ -305,6 +305,16 @@ func NewConfig(arch string, types Types, ctxt *obj.Link, optimize bool) *Config
 		c.LinkReg = linkRegMIPS
 		c.hasGReg = true
 		c.noDuffDevice = true
+	case "riscv64":
+		c.PtrSize = 8
+		c.RegSize = 8
+		c.lowerBlock = rewriteBlockRISCV64
+		c.lowerValue = rewriteValueRISCV64
+		c.registers = registersRISCV64[:]
+		c.gpRegMask = gpRegMaskRISCV64
+		c.fpRegMask = fpRegMaskRISCV64
+		c.FPReg = framepointerRegRISCV64
+		c.hasGReg = true
 	case "wasm":
 		c.PtrSize = 8
 		c.RegSize = 8
src/cmd/compile/internal/ssa/gen/RISCV64.rules (new file, 478 lines)
@@ -0,0 +1,478 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Optimizations TODO:
// * Somehow track when values are already zero/signed-extended, avoid re-extending.
// * Use SLTI and SLTIU for comparisons to constants, instead of SLT/SLTU with constants in registers
// * Find a more efficient way to do zero/sign extension than left+right shift.
//   There are many other options (store then load-extend, LUI+ANDI for zero extend, special case 32->64, ...),
//   but left+right shift is simple and uniform, and we don't have real hardware to do perf testing on anyway.
// * Use the zero register instead of moving 0 into a register.
// * Add rules to avoid generating a temp bool value for (If (SLT[U] ...) ...).
// * Optimize left and right shift by simplifying SLTIU, Neg, and ADD for constants.
// * Arrange for non-trivial Zero and Move lowerings to use aligned loads and stores.
// * Eliminate zero immediate shifts, adds, etc.
// * Use a Duff's device for some moves and zeros.
// * Avoid using Neq32 for writeBarrier.enabled checks.

// Lowering arithmetic
(Add64 x y) -> (ADD x y)
(AddPtr x y) -> (ADD x y)
(Add32 x y) -> (ADD x y)
(Add16 x y) -> (ADD x y)
(Add8 x y) -> (ADD x y)
(Add32F x y) -> (FADDS x y)
(Add64F x y) -> (FADDD x y)

(Sub64 x y) -> (SUB x y)
(SubPtr x y) -> (SUB x y)
(Sub32 x y) -> (SUB x y)
(Sub16 x y) -> (SUB x y)
(Sub8 x y) -> (SUB x y)
(Sub32F x y) -> (FSUBS x y)
(Sub64F x y) -> (FSUBD x y)

(Mul64 x y) -> (MUL x y)
(Mul32 x y) -> (MULW x y)
(Mul16 x y) -> (MULW (SignExt16to32 x) (SignExt16to32 y))
(Mul8 x y) -> (MULW (SignExt8to32 x) (SignExt8to32 y))
(Mul32F x y) -> (FMULS x y)
(Mul64F x y) -> (FMULD x y)

(Div32F x y) -> (FDIVS x y)
(Div64F x y) -> (FDIVD x y)

(Div64 x y) -> (DIV x y)
(Div64u x y) -> (DIVU x y)
(Div32 x y) -> (DIVW x y)
(Div32u x y) -> (DIVUW x y)
(Div16 x y) -> (DIVW (SignExt16to32 x) (SignExt16to32 y))
(Div16u x y) -> (DIVUW (ZeroExt16to32 x) (ZeroExt16to32 y))
(Div8 x y) -> (DIVW (SignExt8to32 x) (SignExt8to32 y))
(Div8u x y) -> (DIVUW (ZeroExt8to32 x) (ZeroExt8to32 y))

(Hmul64 x y) -> (MULH x y)
(Hmul64u x y) -> (MULHU x y)
(Hmul32 x y) -> (SRAI [32] (MUL (SignExt32to64 x) (SignExt32to64 y)))
(Hmul32u x y) -> (SRLI [32] (MUL (ZeroExt32to64 x) (ZeroExt32to64 y)))

// (x + y) / 2 -> (x / 2) + (y / 2) + (x & y & 1)
(Avg64u <t> x y) -> (ADD (ADD <t> (SRLI <t> [1] x) (SRLI <t> [1] y)) (ANDI <t> [1] (AND <t> x y)))

(Mod64 x y) -> (REM x y)
(Mod64u x y) -> (REMU x y)
(Mod32 x y) -> (REMW x y)
(Mod32u x y) -> (REMUW x y)
(Mod16 x y) -> (REMW (SignExt16to32 x) (SignExt16to32 y))
(Mod16u x y) -> (REMUW (ZeroExt16to32 x) (ZeroExt16to32 y))
(Mod8 x y) -> (REMW (SignExt8to32 x) (SignExt8to32 y))
(Mod8u x y) -> (REMUW (ZeroExt8to32 x) (ZeroExt8to32 y))

(And64 x y) -> (AND x y)
(And32 x y) -> (AND x y)
(And16 x y) -> (AND x y)
(And8 x y) -> (AND x y)

(Or64 x y) -> (OR x y)
(Or32 x y) -> (OR x y)
(Or16 x y) -> (OR x y)
(Or8 x y) -> (OR x y)

(Xor64 x y) -> (XOR x y)
(Xor32 x y) -> (XOR x y)
(Xor16 x y) -> (XOR x y)
(Xor8 x y) -> (XOR x y)

(Neg64 x) -> (SUB (MOVDconst) x)
(Neg32 x) -> (SUB (MOVWconst) x)
(Neg16 x) -> (SUB (MOVHconst) x)
(Neg8 x) -> (SUB (MOVBconst) x)
(Neg32F x) -> (FNEGS x)
(Neg64F x) -> (FNEGD x)

(Com64 x) -> (XORI [int64(-1)] x)
(Com32 x) -> (XORI [int64(-1)] x)
(Com16 x) -> (XORI [int64(-1)] x)
(Com8 x) -> (XORI [int64(-1)] x)

(Sqrt x) -> (FSQRTD x)

// Zero and sign extension
// Shift left until the bits we want are at the top of the register.
// Then logical/arithmetic shift right for zero/sign extend.
// We always extend to 64 bits; there's no reason not to,
// and optimization rules can then collapse some extensions.

(SignExt8to16 <t> x) -> (SRAI [56] (SLLI <t> [56] x))
(SignExt8to32 <t> x) -> (SRAI [56] (SLLI <t> [56] x))
(SignExt8to64 <t> x) -> (SRAI [56] (SLLI <t> [56] x))
(SignExt16to32 <t> x) -> (SRAI [48] (SLLI <t> [48] x))
(SignExt16to64 <t> x) -> (SRAI [48] (SLLI <t> [48] x))
(SignExt32to64 <t> x) -> (SRAI [32] (SLLI <t> [32] x))

(ZeroExt8to16 <t> x) -> (SRLI [56] (SLLI <t> [56] x))
(ZeroExt8to32 <t> x) -> (SRLI [56] (SLLI <t> [56] x))
(ZeroExt8to64 <t> x) -> (SRLI [56] (SLLI <t> [56] x))
(ZeroExt16to32 <t> x) -> (SRLI [48] (SLLI <t> [48] x))
(ZeroExt16to64 <t> x) -> (SRLI [48] (SLLI <t> [48] x))
(ZeroExt32to64 <t> x) -> (SRLI [32] (SLLI <t> [32] x))

(Cvt32to32F x) -> (FCVTSW x)
(Cvt32to64F x) -> (FCVTDW x)
(Cvt64to32F x) -> (FCVTSL x)
(Cvt64to64F x) -> (FCVTDL x)

(Cvt32Fto32 x) -> (FCVTWS x)
(Cvt32Fto64 x) -> (FCVTLS x)
(Cvt64Fto32 x) -> (FCVTWD x)
(Cvt64Fto64 x) -> (FCVTLD x)

(Cvt32Fto64F x) -> (FCVTDS x)
(Cvt64Fto32F x) -> (FCVTSD x)

(Round32F x) -> x
(Round64F x) -> x

// From genericOps.go:
// "0 if arg0 == 0, -1 if arg0 > 0, undef if arg0<0"
//
// Like other arches, we compute ~((x-1) >> 63), with arithmetic right shift.
// For positive x, bit 63 of x-1 is always 0, so the result is -1.
// For zero x, bit 63 of x-1 is 1, so the result is 0.
//
// TODO(prattmic): Use XORconst etc instead of XOR (MOVDconst).
(Slicemask <t> x) -> (XOR (MOVDconst [-1]) (SRA <t> (SUB <t> x (MOVDconst [1])) (MOVDconst [63])))

// Truncations
// We ignore the unused high parts of registers, so truncates are just copies.
(Trunc16to8 x) -> x
(Trunc32to8 x) -> x
(Trunc32to16 x) -> x
(Trunc64to8 x) -> x
(Trunc64to16 x) -> x
(Trunc64to32 x) -> x

// Shifts

// SLL only considers the bottom 6 bits of y. If y > 64, the result should
// always be 0.
//
// Breaking down the operation:
//
// (SLL x y) generates x << (y & 63).
//
// If y < 64, this is the value we want. Otherwise, we want zero.
//
// So, we AND with -1 * uint64(y < 64), which is 0xfffff... if y < 64 and 0 otherwise.
(Lsh8x8 <t> x y) -> (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
(Lsh8x16 <t> x y) -> (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Lsh8x32 <t> x y) -> (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Lsh8x64 <t> x y) -> (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] y)))
(Lsh16x8 <t> x y) -> (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
(Lsh16x16 <t> x y) -> (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Lsh16x32 <t> x y) -> (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Lsh16x64 <t> x y) -> (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] y)))
(Lsh32x8 <t> x y) -> (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
(Lsh32x16 <t> x y) -> (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Lsh32x32 <t> x y) -> (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Lsh32x64 <t> x y) -> (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] y)))
(Lsh64x8 <t> x y) -> (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
(Lsh64x16 <t> x y) -> (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Lsh64x32 <t> x y) -> (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Lsh64x64 <t> x y) -> (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] y)))

// SRL only considers the bottom 6 bits of y. If y > 64, the result should
// always be 0. See Lsh above for a detailed description.
(Rsh8Ux8 <t> x y) -> (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
(Rsh8Ux16 <t> x y) -> (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Rsh8Ux32 <t> x y) -> (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Rsh8Ux64 <t> x y) -> (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] y)))
(Rsh16Ux8 <t> x y) -> (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
(Rsh16Ux16 <t> x y) -> (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Rsh16Ux32 <t> x y) -> (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Rsh16Ux64 <t> x y) -> (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] y)))
(Rsh32Ux8 <t> x y) -> (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
(Rsh32Ux16 <t> x y) -> (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Rsh32Ux32 <t> x y) -> (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Rsh32Ux64 <t> x y) -> (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] y)))
(Rsh64Ux8 <t> x y) -> (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
(Rsh64Ux16 <t> x y) -> (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
(Rsh64Ux32 <t> x y) -> (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
(Rsh64Ux64 <t> x y) -> (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] y)))

// SRA only considers the bottom 6 bits of y. If y > 64, the result should
// be either 0 or -1 based on the sign bit.
//
// We implement this by performing the max shift (-1) if y >= 64.
//
// We OR (uint64(y < 64) - 1) into y before passing it to SRA. This leaves
// us with -1 (0xffff...) if y >= 64.
//
// We don't need to sign-extend the OR result, as it will be at minimum 8 bits,
// more than the 6 bits SRA cares about.
(Rsh8x8 <t> x y) -> (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
(Rsh8x16 <t> x y) -> (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
(Rsh8x32 <t> x y) -> (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
(Rsh8x64 <t> x y) -> (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
(Rsh16x8 <t> x y) -> (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
(Rsh16x16 <t> x y) -> (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
(Rsh16x32 <t> x y) -> (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
(Rsh16x64 <t> x y) -> (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
(Rsh32x8 <t> x y) -> (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
(Rsh32x16 <t> x y) -> (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
(Rsh32x32 <t> x y) -> (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
(Rsh32x64 <t> x y) -> (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
(Rsh64x8 <t> x y) -> (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
(Rsh64x16 <t> x y) -> (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
(Rsh64x32 <t> x y) -> (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
(Rsh64x64 <t> x y) -> (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))

// rotates
(RotateLeft8 <t> x (MOVBconst [c])) -> (Or8 (Lsh8x64 <t> x (MOVBconst [c&7])) (Rsh8Ux64 <t> x (MOVBconst [-c&7])))
(RotateLeft16 <t> x (MOVHconst [c])) -> (Or16 (Lsh16x64 <t> x (MOVHconst [c&15])) (Rsh16Ux64 <t> x (MOVHconst [-c&15])))
(RotateLeft32 <t> x (MOVWconst [c])) -> (Or32 (Lsh32x64 <t> x (MOVWconst [c&31])) (Rsh32Ux64 <t> x (MOVWconst [-c&31])))
(RotateLeft64 <t> x (MOVDconst [c])) -> (Or64 (Lsh64x64 <t> x (MOVDconst [c&63])) (Rsh64Ux64 <t> x (MOVDconst [-c&63])))

(Less64 x y) -> (SLT x y)
(Less32 x y) -> (SLT (SignExt32to64 x) (SignExt32to64 y))
(Less16 x y) -> (SLT (SignExt16to64 x) (SignExt16to64 y))
(Less8 x y) -> (SLT (SignExt8to64 x) (SignExt8to64 y))
(Less64U x y) -> (SLTU x y)
(Less32U x y) -> (SLTU (ZeroExt32to64 x) (ZeroExt32to64 y))
(Less16U x y) -> (SLTU (ZeroExt16to64 x) (ZeroExt16to64 y))
(Less8U x y) -> (SLTU (ZeroExt8to64 x) (ZeroExt8to64 y))
(Less64F x y) -> (FLTD x y)
(Less32F x y) -> (FLTS x y)

// Convert x <= y to !(y > x).
(Leq64 x y) -> (Not (Less64 y x))
(Leq32 x y) -> (Not (Less32 y x))
(Leq16 x y) -> (Not (Less16 y x))
(Leq8 x y) -> (Not (Less8 y x))
(Leq64U x y) -> (Not (Less64U y x))
(Leq32U x y) -> (Not (Less32U y x))
(Leq16U x y) -> (Not (Less16U y x))
(Leq8U x y) -> (Not (Less8U y x))
(Leq64F x y) -> (FLED x y)
(Leq32F x y) -> (FLES x y)

// Convert x > y to y < x.
(Greater64 x y) -> (Less64 y x)
(Greater32 x y) -> (Less32 y x)
(Greater16 x y) -> (Less16 y x)
(Greater8 x y) -> (Less8 y x)
(Greater64U x y) -> (Less64U y x)
(Greater32U x y) -> (Less32U y x)
(Greater16U x y) -> (Less16U y x)
(Greater8U x y) -> (Less8U y x)
(Greater64F x y) -> (FLTD y x)
(Greater32F x y) -> (FLTS y x)

// Convert x >= y to !(x < y)
(Geq64 x y) -> (Not (Less64 x y))
(Geq32 x y) -> (Not (Less32 x y))
(Geq16 x y) -> (Not (Less16 x y))
(Geq8 x y) -> (Not (Less8 x y))
(Geq64U x y) -> (Not (Less64U x y))
(Geq32U x y) -> (Not (Less32U x y))
(Geq16U x y) -> (Not (Less16U x y))
(Geq8U x y) -> (Not (Less8U x y))
(Geq64F x y) -> (FLED y x)
(Geq32F x y) -> (FLES y x)

(EqPtr x y) -> (SEQZ (SUB <x.Type> x y))
(Eq64 x y) -> (SEQZ (SUB <x.Type> x y))
(Eq32 x y) -> (SEQZ (ZeroExt32to64 (SUB <x.Type> x y)))
(Eq16 x y) -> (SEQZ (ZeroExt16to64 (SUB <x.Type> x y)))
(Eq8 x y) -> (SEQZ (ZeroExt8to64 (SUB <x.Type> x y)))
(Eq64F x y) -> (FEQD x y)
(Eq32F x y) -> (FEQS x y)

(NeqPtr x y) -> (SNEZ (SUB <x.Type> x y))
(Neq64 x y) -> (SNEZ (SUB <x.Type> x y))
(Neq32 x y) -> (SNEZ (ZeroExt32to64 (SUB <x.Type> x y)))
(Neq16 x y) -> (SNEZ (ZeroExt16to64 (SUB <x.Type> x y)))
(Neq8 x y) -> (SNEZ (ZeroExt8to64 (SUB <x.Type> x y)))
(Neq64F x y) -> (FNED x y)
(Neq32F x y) -> (FNES x y)

// Loads
(Load <t> ptr mem) && t.IsBoolean() -> (MOVBUload ptr mem)
(Load <t> ptr mem) && ( is8BitInt(t) && isSigned(t)) -> (MOVBload ptr mem)
(Load <t> ptr mem) && ( is8BitInt(t) && !isSigned(t)) -> (MOVBUload ptr mem)
(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) -> (MOVHload ptr mem)
(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) -> (MOVHUload ptr mem)
(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) -> (MOVWload ptr mem)
(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) -> (MOVWUload ptr mem)
(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) -> (MOVDload ptr mem)
(Load <t> ptr mem) && is32BitFloat(t) -> (FMOVWload ptr mem)
(Load <t> ptr mem) && is64BitFloat(t) -> (FMOVDload ptr mem)

// Stores
(Store {t} ptr val mem) && t.(*types.Type).Size() == 1 -> (MOVBstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 2 -> (MOVHstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 4 && !is32BitFloat(val.Type) -> (MOVWstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 8 && !is64BitFloat(val.Type) -> (MOVDstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 4 && is32BitFloat(val.Type) -> (FMOVWstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 8 && is64BitFloat(val.Type) -> (FMOVDstore ptr val mem)

// We need to fold MOVaddr into the LD/MOVDstore ops so that the live variable analysis
// knows what variables are being read/written by the ops.
(MOVBUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVBUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVBload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVHUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVHUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVHload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVWUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVWUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVWload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVDload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem)

(MOVBstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVHstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVWstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVDstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
	(MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)

(MOVBUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVBUload [off1+off2] {sym} base mem)
(MOVBload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVBload [off1+off2] {sym} base mem)
(MOVHUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVHUload [off1+off2] {sym} base mem)
(MOVHload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVHload [off1+off2] {sym} base mem)
(MOVWUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVWUload [off1+off2] {sym} base mem)
(MOVWload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVWload [off1+off2] {sym} base mem)
(MOVDload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(off1+off2) ->
	(MOVDload [off1+off2] {sym} base mem)

(MOVBstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(off1+off2) ->
	(MOVBstore [off1+off2] {sym} base val mem)
(MOVHstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(off1+off2) ->
	(MOVHstore [off1+off2] {sym} base val mem)
(MOVWstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(off1+off2) ->
	(MOVWstore [off1+off2] {sym} base val mem)
(MOVDstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(off1+off2) ->
	(MOVDstore [off1+off2] {sym} base val mem)

// Similarly, fold ADDI into MOVaddr to avoid confusing live variable analysis
// with OffPtr -> ADDI.
(ADDI [c] (MOVaddr [d] {s} x)) && is32Bit(c+d) -> (MOVaddr [c+d] {s} x)

// Zeroing
// TODO: more optimized zeroing, including attempting to use aligned accesses.
(Zero [0] _ mem) -> mem
(Zero [1] ptr mem) -> (MOVBstore ptr (MOVBconst) mem)
(Zero [2] ptr mem) -> (MOVHstore ptr (MOVHconst) mem)
(Zero [4] ptr mem) -> (MOVWstore ptr (MOVWconst) mem)
(Zero [8] ptr mem) -> (MOVDstore ptr (MOVDconst) mem)

// Generic zeroing uses a loop
(Zero [s] {t} ptr mem) ->
	(LoweredZero [t.(*types.Type).Alignment()]
		ptr
		(ADD <ptr.Type> ptr (MOVDconst [s-moveSize(t.(*types.Type).Alignment(), config)]))
		mem)

(Convert x mem) -> (MOVconvert x mem)

// Checks
(IsNonNil p) -> (NeqPtr (MOVDconst) p)
(IsInBounds idx len) -> (Less64U idx len)
(IsSliceInBounds idx len) -> (Leq64U idx len)

// Trivial lowering
(NilCheck ptr mem) -> (LoweredNilCheck ptr mem)
(GetClosurePtr) -> (LoweredGetClosurePtr)
(GetCallerSP) -> (LoweredGetCallerSP)
(GetCallerPC) -> (LoweredGetCallerPC)

// Write barrier.
(WB {fn} destptr srcptr mem) -> (LoweredWB {fn} destptr srcptr mem)

(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 -> (LoweredPanicBoundsA [kind] x y mem)
(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 -> (LoweredPanicBoundsB [kind] x y mem)
(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 -> (LoweredPanicBoundsC [kind] x y mem)

// Moves
// TODO: more optimized moves, including attempting to use aligned accesses.
(Move [0] _ _ mem) -> mem
(Move [1] dst src mem) -> (MOVBstore dst (MOVBload src mem) mem)
(Move [2] dst src mem) -> (MOVHstore dst (MOVHload src mem) mem)
(Move [4] dst src mem) -> (MOVWstore dst (MOVWload src mem) mem)
(Move [8] dst src mem) -> (MOVDstore dst (MOVDload src mem) mem)

// Generic move uses a loop
(Move [s] {t} dst src mem) ->
	(LoweredMove [t.(*types.Type).Alignment()]
		dst
		src
		(ADDI <src.Type> [s-moveSize(t.(*types.Type).Alignment(), config)] src)
		mem)

// Boolean ops; 0=false, 1=true
(AndB x y) -> (AND x y)
(OrB x y) -> (OR x y)
(EqB x y) -> (XORI [1] (XOR <typ.Bool> x y))
(NeqB x y) -> (XOR x y)
(Not x) -> (XORI [1] x)

// Lowering pointer arithmetic
// TODO: Special handling for SP offsets, like ARM
(OffPtr [off] ptr:(SP)) -> (MOVaddr [off] ptr)
(OffPtr [off] ptr) && is32Bit(off) -> (ADDI [off] ptr)
(OffPtr [off] ptr) -> (ADD (MOVDconst [off]) ptr)

(Const8 [val]) -> (MOVBconst [val])
(Const16 [val]) -> (MOVHconst [val])
(Const32 [val]) -> (MOVWconst [val])
(Const64 [val]) -> (MOVDconst [val])
(Const32F [val]) -> (FMVSX (MOVWconst [int64(int32(math.Float32bits(float32(math.Float64frombits(uint64(val))))))]))
(Const64F [val]) -> (FMVDX (MOVDconst [val]))
(ConstNil) -> (MOVDconst [0])
(ConstBool [b]) -> (MOVBconst [b])

// Convert 64 bit immediate to two 32 bit immediates, combine with add and shift.
// The lower 32 bit immediate will be treated as signed,
// so if it is negative, adjust for the borrow by incrementing the top half.
// We don't have to worry about overflow from the increment,
// because if the top half is all 1s, and int32(c) is negative,
// then the overall constant fits in an int32.
(MOVDconst <t> [c]) && !is32Bit(c) && int32(c) < 0 -> (ADD (SLLI <t> [32] (MOVDconst [c>>32+1])) (MOVDconst [int64(int32(c))]))
(MOVDconst <t> [c]) && !is32Bit(c) && int32(c) >= 0 -> (ADD (SLLI <t> [32] (MOVDconst [c>>32+0])) (MOVDconst [int64(int32(c))]))

// Fold ADD+MOVDconst into ADDI where possible.
(ADD (MOVDconst [off]) ptr) && is32Bit(off) -> (ADDI [off] ptr)

(Addr {sym} base) -> (MOVaddr {sym} base)
(LocalAddr {sym} base _) -> (MOVaddr {sym} base)

// Conditional branches
//
// cond is 1 if true. BNE compares against 0.
//
// TODO(prattmic): RISCV branch instructions take two operands to compare,
// so we could generate more efficient code by computing the condition in the
// branch itself. This should be revisited now that the compiler has support
// for two control values (https://golang.org/cl/196557).
(If cond yes no) -> (BNE cond yes no)

// Calls
(StaticCall [argwid] {target} mem) -> (CALLstatic [argwid] {target} mem)
(ClosureCall [argwid] entry closure mem) -> (CALLclosure [argwid] entry closure mem)
(InterCall [argwid] entry mem) -> (CALLinter [argwid] entry mem)

// remove redundant *const ops
(ADDI [0] x) -> x
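The shift comments above describe a masking trick: Go demands a zero result once the count reaches the operand width, while the hardware shift only looks at the low 6 bits of the count. A small Go sketch of the 64-bit left-shift case (illustrative only; the rules express the same computation in SSA form):

// x << y with Go semantics: take the hardware shift x << (y & 63), then AND
// with -1 * uint64(y < 64), which is all ones when y < 64 and zero otherwise.
func lsh64(x, y uint64) uint64 {
	hw := x << (y & 63) // SLL
	var lt uint64
	if y < 64 { // SLTIU $64, y
		lt = 1
	}
	return hw & -lt // AND (Neg64 (SLTIU ...))
}

The unsigned right shifts use the same mask, and the signed right shifts instead force the count to 63-or-more when y >= 64 so the sign bit is replicated.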
312
src/cmd/compile/internal/ssa/gen/RISCV64Ops.go
Normal file
312
src/cmd/compile/internal/ssa/gen/RISCV64Ops.go
Normal file
@ -0,0 +1,312 @@
|
|||||||
|
// Copyright 2016 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// +build ignore
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import "cmd/internal/obj/riscv"
|
||||||
|
|
||||||
|
// Suffixes encode the bit width of various instructions:
|
||||||
|
//
|
||||||
|
// D (double word) = 64 bit int
|
||||||
|
// W (word) = 32 bit int
|
||||||
|
// H (half word) = 16 bit int
|
||||||
|
// B (byte) = 8 bit int
|
||||||
|
// S (single) = 32 bit float
|
||||||
|
// D (double) = 64 bit float
|
||||||
|
// L = 64 bit int, used when the opcode starts with F
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var regNamesRISCV64 []string
|
||||||
|
var gpMask, fpMask, gpspMask, gpspsbMask regMask
|
||||||
|
regNamed := make(map[string]regMask)
|
||||||
|
|
||||||
|
// Build the list of register names, creating an appropriately indexed
|
||||||
|
// regMask for the gp and fp registers as we go.
|
||||||
|
//
|
||||||
|
// If name is specified, use it rather than the riscv reg number.
|
||||||
|
addreg := func(r int, name string) regMask {
|
||||||
|
mask := regMask(1) << uint(len(regNamesRISCV64))
|
||||||
|
if name == "" {
|
||||||
|
name = riscv.RegName(r)
|
||||||
|
}
|
||||||
|
regNamesRISCV64 = append(regNamesRISCV64, name)
|
||||||
|
regNamed[name] = mask
|
||||||
|
return mask
|
||||||
|
}
|
||||||
|
|
||||||
|
// General purpose registers.
|
||||||
|
for r := riscv.REG_X0; r <= riscv.REG_X31; r++ {
|
||||||
|
if r == riscv.REG_LR {
|
||||||
|
// LR is not used by regalloc, so we skip it to leave
|
||||||
|
// room for pseudo-register SB.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
mask := addreg(r, "")
|
||||||
|
|
||||||
|
// Add general purpose registers to gpMask.
|
||||||
|
switch r {
|
||||||
|
// ZERO, g, and TMP are not in any gp mask.
|
||||||
|
case riscv.REG_ZERO, riscv.REG_G, riscv.REG_TMP:
|
||||||
|
case riscv.REG_SP:
|
||||||
|
gpspMask |= mask
|
||||||
|
gpspsbMask |= mask
|
||||||
|
default:
|
||||||
|
gpMask |= mask
|
||||||
|
gpspMask |= mask
|
||||||
|
gpspsbMask |= mask
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Floating pointer registers.
|
||||||
|
for r := riscv.REG_F0; r <= riscv.REG_F31; r++ {
|
||||||
|
mask := addreg(r, "")
|
||||||
|
fpMask |= mask
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pseudo-register: SB
|
||||||
|
mask := addreg(-1, "SB")
|
||||||
|
gpspsbMask |= mask
|
||||||
|
|
||||||
|
if len(regNamesRISCV64) > 64 {
|
||||||
|
// regMask is only 64 bits.
|
||||||
|
panic("Too many RISCV64 registers")
|
||||||
|
}
|
||||||
|
|
||||||
|
regCtxt := regNamed["X20"]
|
||||||
|
callerSave := gpMask | fpMask | regNamed["g"]
|
||||||
|
|
||||||
|
	var (
		gpstore = regInfo{inputs: []regMask{gpspsbMask, gpspMask, 0}} // SB in first input so we can load from a global, but not in second to avoid using SB as a temporary register
		gp01    = regInfo{outputs: []regMask{gpMask}}
		gp11    = regInfo{inputs: []regMask{gpMask}, outputs: []regMask{gpMask}}
		gp21    = regInfo{inputs: []regMask{gpMask, gpMask}, outputs: []regMask{gpMask}}
		gpload  = regInfo{inputs: []regMask{gpspsbMask, 0}, outputs: []regMask{gpMask}}
		gp11sb  = regInfo{inputs: []regMask{gpspsbMask}, outputs: []regMask{gpMask}}

		fp11    = regInfo{inputs: []regMask{fpMask}, outputs: []regMask{fpMask}}
		fp21    = regInfo{inputs: []regMask{fpMask, fpMask}, outputs: []regMask{fpMask}}
		gpfp    = regInfo{inputs: []regMask{gpMask}, outputs: []regMask{fpMask}}
		fpgp    = regInfo{inputs: []regMask{fpMask}, outputs: []regMask{gpMask}}
		fpstore = regInfo{inputs: []regMask{gpspsbMask, fpMask, 0}}
		fpload  = regInfo{inputs: []regMask{gpspsbMask, 0}, outputs: []regMask{fpMask}}
		fp2gp   = regInfo{inputs: []regMask{fpMask, fpMask}, outputs: []regMask{gpMask}}

		call        = regInfo{clobbers: callerSave}
		callClosure = regInfo{inputs: []regMask{gpspMask, regCtxt, 0}, clobbers: callerSave}
		callInter   = regInfo{inputs: []regMask{gpMask}, clobbers: callerSave}
	)
	RISCV64ops := []opData{
		{name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1
		{name: "ADDI", argLength: 1, reg: gp11sb, asm: "ADDI", aux: "Int64"},  // arg0 + auxint
		{name: "SUB", argLength: 2, reg: gp21, asm: "SUB"},                    // arg0 - arg1

		// M extension. H means high (i.e., it returns the top bits of
		// the result). U means unsigned. W means word (i.e., 32-bit).
		{name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true, typ: "Int64"}, // arg0 * arg1
		{name: "MULW", argLength: 2, reg: gp21, asm: "MULW", commutative: true, typ: "Int32"},
		{name: "MULH", argLength: 2, reg: gp21, asm: "MULH", commutative: true, typ: "Int64"},
		{name: "MULHU", argLength: 2, reg: gp21, asm: "MULHU", commutative: true, typ: "UInt64"},
		{name: "DIV", argLength: 2, reg: gp21, asm: "DIV", typ: "Int64"}, // arg0 / arg1
		{name: "DIVU", argLength: 2, reg: gp21, asm: "DIVU", typ: "UInt64"},
		{name: "DIVW", argLength: 2, reg: gp21, asm: "DIVW", typ: "Int32"},
		{name: "DIVUW", argLength: 2, reg: gp21, asm: "DIVUW", typ: "UInt32"},
		{name: "REM", argLength: 2, reg: gp21, asm: "REM", typ: "Int64"}, // arg0 % arg1
		{name: "REMU", argLength: 2, reg: gp21, asm: "REMU", typ: "UInt64"},
		{name: "REMW", argLength: 2, reg: gp21, asm: "REMW", typ: "Int32"},
		{name: "REMUW", argLength: 2, reg: gp21, asm: "REMUW", typ: "UInt32"},

		{name: "MOVaddr", argLength: 1, reg: gp11sb, asm: "MOV", aux: "SymOff", rematerializeable: true, symEffect: "RdWr"}, // arg0 + auxint + offset encoded in aux
		// auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
{name: "MOVBconst", reg: gp01, asm: "MOV", typ: "UInt8", aux: "Int8", rematerializeable: true}, // 8 low bits of auxint
|
||||||
|
{name: "MOVHconst", reg: gp01, asm: "MOV", typ: "UInt16", aux: "Int16", rematerializeable: true}, // 16 low bits of auxint
|
||||||
|
{name: "MOVWconst", reg: gp01, asm: "MOV", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint
|
||||||
|
{name: "MOVDconst", reg: gp01, asm: "MOV", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint
|
||||||
|
|
||||||
|
// Loads: load <size> bits from arg0+auxint+aux and extend to 64 bits; arg1=mem
|
||||||
|
{name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVB", aux: "SymOff", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // 8 bits, sign extend
|
||||||
|
{name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // 16 bits, sign extend
|
||||||
|
{name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVW", aux: "SymOff", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // 32 bits, sign extend
|
||||||
|
{name: "MOVDload", argLength: 2, reg: gpload, asm: "MOV", aux: "SymOff", typ: "Int64", faultOnNilArg0: true, symEffect: "Read"}, // 64 bits
|
||||||
|
{name: "MOVBUload", argLength: 2, reg: gpload, asm: "MOVBU", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // 8 bits, zero extend
|
||||||
|
{name: "MOVHUload", argLength: 2, reg: gpload, asm: "MOVHU", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // 16 bits, zero extend
|
||||||
|
{name: "MOVWUload", argLength: 2, reg: gpload, asm: "MOVWU", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // 32 bits, zero extend
|
||||||
|
|
||||||
|
// Stores: store <size> lowest bits in arg1 to arg0+auxint+aux; arg2=mem
|
||||||
|
{name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 8 bits
|
||||||
|
{name: "MOVHstore", argLength: 3, reg: gpstore, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 16 bits
|
||||||
|
{name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 32 bits
|
||||||
|
{name: "MOVDstore", argLength: 3, reg: gpstore, asm: "MOV", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 64 bits
|
||||||
|
|
||||||
|
		// Shift ops
		{name: "SLL", argLength: 2, reg: gp21, asm: "SLL"},                 // arg0 << aux1
		{name: "SRA", argLength: 2, reg: gp21, asm: "SRA"},                 // arg0 >> aux1, signed
		{name: "SRL", argLength: 2, reg: gp21, asm: "SRL"},                 // arg0 >> aux1, unsigned
		{name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint
		{name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, signed
		{name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, unsigned

		// Bitwise ops
		{name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1
		{name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"},    // arg0 ^ auxint
		{name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true},   // arg0 | arg1
		{name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"},      // arg0 | auxint
		{name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
		{name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"},    // arg0 & auxint

		// Generate boolean values
		{name: "SEQZ", argLength: 1, reg: gp11, asm: "SEQZ"},                 // arg0 == 0, result is 0 or 1
		{name: "SNEZ", argLength: 1, reg: gp11, asm: "SNEZ"},                 // arg0 != 0, result is 0 or 1
		{name: "SLT", argLength: 2, reg: gp21, asm: "SLT"},                   // arg0 < arg1, result is 0 or 1
		{name: "SLTI", argLength: 1, reg: gp11, asm: "SLTI", aux: "Int64"},   // arg0 < auxint, result is 0 or 1
		{name: "SLTU", argLength: 2, reg: gp21, asm: "SLTU"},                 // arg0 < arg1, unsigned, result is 0 or 1
		{name: "SLTIU", argLength: 1, reg: gp11, asm: "SLTIU", aux: "Int64"}, // arg0 < auxint, unsigned, result is 0 or 1
		// MOVconvert converts between pointers and integers.
		// We have a special op for this so as to not confuse GC
		// (particularly stack maps). It takes a memory arg so it
		// gets correctly ordered with respect to GC safepoints.
		{name: "MOVconvert", argLength: 2, reg: gp11, asm: "MOV"}, // arg0, but converted to int/ptr as appropriate; arg1=mem

		// Calls
		{name: "CALLstatic", argLength: 1, reg: call, aux: "SymOff", call: true, symEffect: "None"}, // call static function aux.(*gc.Sym). arg0=mem, auxint=argsize, returns mem
		{name: "CALLclosure", argLength: 3, reg: callClosure, aux: "Int64", call: true},             // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
		{name: "CALLinter", argLength: 2, reg: callInter, aux: "Int64", call: true},                 // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
		// Generic moves and zeros

		// general unaligned zeroing
		// arg0 = address of memory to zero (in X5, changed as side effect)
		// arg1 = address of the last element to zero (inclusive)
		// arg2 = mem
		// auxint = element size
		// returns mem
		//	mov	ZERO, (X5)
		//	ADD	$sz, X5
		//	BGEU	Rarg1, X5, -2(PC)
		{
			name:      "LoweredZero",
			aux:       "Int64",
			argLength: 3,
			reg: regInfo{
				inputs:   []regMask{regNamed["X5"], gpMask},
				clobbers: regNamed["X5"],
			},
			typ:            "Mem",
			faultOnNilArg0: true,
		},

		// general unaligned move
		// arg0 = address of dst memory (in X5, changed as side effect)
		// arg1 = address of src memory (in X6, changed as side effect)
		// arg2 = address of the last element of src (can't be X7 as we clobber it before using arg2)
		// arg3 = mem
		// auxint = alignment
		// clobbers X7 as a tmp register.
		// returns mem
		//	mov	(X6), X7
		//	mov	X7, (X5)
		//	ADD	$sz, X5
		//	ADD	$sz, X6
		//	BGEU	Rarg2, X5, -4(PC)
		{
			name:      "LoweredMove",
			aux:       "Int64",
			argLength: 4,
			reg: regInfo{
				inputs:   []regMask{regNamed["X5"], regNamed["X6"], gpMask &^ regNamed["X7"]},
				clobbers: regNamed["X5"] | regNamed["X6"] | regNamed["X7"],
			},
			typ:            "Mem",
			faultOnNilArg0: true,
			faultOnNilArg1: true,
		},
		// Lowering pass-throughs
		{name: "LoweredNilCheck", argLength: 2, faultOnNilArg0: true, nilCheck: true, reg: regInfo{inputs: []regMask{gpspMask}}}, // arg0=ptr, arg1=mem, returns void. Faults if ptr is nil.
		{name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{regCtxt}}},                                                // scheduler ensures only at beginning of entry block

		// LoweredGetCallerSP returns the SP of the caller of the current function.
		{name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},

		// LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
		// I.e., if f calls g, and g calls getcallerpc,
		// the result should be the PC within f that g will return to.
		// See runtime/stubs.go for a more detailed discussion.
		{name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},

		// LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
		// It saves all GP registers if necessary,
		// but clobbers RA (LR) because it's a call
		// and T6 (REG_TMP).
		{name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{regNamed["X5"], regNamed["X6"]}, clobbers: (callerSave &^ (gpMask | regNamed["g"])) | regNamed["X1"]}, clobberFlags: true, aux: "Sym", symEffect: "None"},

		// There are three of these functions so that they can have three different register inputs.
		// When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
		// default registers to match so we don't need to copy registers around unnecessarily.
		{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{regNamed["X7"], regNamed["X28"]}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
		{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{regNamed["X6"], regNamed["X7"]}}, typ: "Mem"},  // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
		{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{regNamed["X5"], regNamed["X6"]}}, typ: "Mem"},  // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
		// F extension.
		{name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true, typ: "Float32"},  // arg0 + arg1
		{name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS", commutative: false, typ: "Float32"}, // arg0 - arg1
		{name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true, typ: "Float32"},  // arg0 * arg1
		{name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS", commutative: false, typ: "Float32"}, // arg0 / arg1
		{name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS", typ: "Float32"},                   // sqrt(arg0)
		{name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS", typ: "Float32"},                     // -arg0
		{name: "FMVSX", argLength: 1, reg: gpfp, asm: "FMVSX", typ: "Float32"},                     // reinterpret arg0 as float
		{name: "FCVTSW", argLength: 1, reg: gpfp, asm: "FCVTSW", typ: "Float32"},                   // float32(low 32 bits of arg0)
		{name: "FCVTSL", argLength: 1, reg: gpfp, asm: "FCVTSL", typ: "Float32"},                   // float32(arg0)
		{name: "FCVTWS", argLength: 1, reg: fpgp, asm: "FCVTWS", typ: "Int32"},                     // int32(arg0)
		{name: "FCVTLS", argLength: 1, reg: fpgp, asm: "FCVTLS", typ: "Int64"},                     // int64(arg0)
		{name: "FMOVWload", argLength: 2, reg: fpload, asm: "MOVF", aux: "SymOff", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load float32 from arg0+auxint+aux
		{name: "FMOVWstore", argLength: 3, reg: fpstore, asm: "MOVF", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},  // store float32 to arg0+auxint+aux
		{name: "FEQS", argLength: 2, reg: fp2gp, asm: "FEQS", commutative: true}, // arg0 == arg1
		{name: "FNES", argLength: 2, reg: fp2gp, asm: "FNES", commutative: true}, // arg0 != arg1
		{name: "FLTS", argLength: 2, reg: fp2gp, asm: "FLTS"},                    // arg0 < arg1
		{name: "FLES", argLength: 2, reg: fp2gp, asm: "FLES"},                    // arg0 <= arg1
		// D extension.
		{name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true, typ: "Float64"},  // arg0 + arg1
		{name: "FSUBD", argLength: 2, reg: fp21, asm: "FSUBD", commutative: false, typ: "Float64"}, // arg0 - arg1
		{name: "FMULD", argLength: 2, reg: fp21, asm: "FMULD", commutative: true, typ: "Float64"},  // arg0 * arg1
		{name: "FDIVD", argLength: 2, reg: fp21, asm: "FDIVD", commutative: false, typ: "Float64"}, // arg0 / arg1
		{name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD", typ: "Float64"},                   // sqrt(arg0)
		{name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD", typ: "Float64"},                     // -arg0
		{name: "FMVDX", argLength: 1, reg: gpfp, asm: "FMVDX", typ: "Float64"},                     // reinterpret arg0 as float
		{name: "FCVTDW", argLength: 1, reg: gpfp, asm: "FCVTDW", typ: "Float64"},                   // float64(low 32 bits of arg0)
		{name: "FCVTDL", argLength: 1, reg: gpfp, asm: "FCVTDL", typ: "Float64"},                   // float64(arg0)
		{name: "FCVTWD", argLength: 1, reg: fpgp, asm: "FCVTWD", typ: "Int32"},                     // int32(arg0)
		{name: "FCVTLD", argLength: 1, reg: fpgp, asm: "FCVTLD", typ: "Int64"},                     // int64(arg0)
		{name: "FCVTDS", argLength: 1, reg: fp11, asm: "FCVTDS", typ: "Float64"},                   // float64(arg0)
		{name: "FCVTSD", argLength: 1, reg: fp11, asm: "FCVTSD", typ: "Float32"},                   // float32(arg0)
		{name: "FMOVDload", argLength: 2, reg: fpload, asm: "MOVD", aux: "SymOff", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load float64 from arg0+auxint+aux
		{name: "FMOVDstore", argLength: 3, reg: fpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},  // store float64 to arg0+auxint+aux
		{name: "FEQD", argLength: 2, reg: fp2gp, asm: "FEQD", commutative: true}, // arg0 == arg1
		{name: "FNED", argLength: 2, reg: fp2gp, asm: "FNED", commutative: true}, // arg0 != arg1
		{name: "FLTD", argLength: 2, reg: fp2gp, asm: "FLTD"},                    // arg0 < arg1
		{name: "FLED", argLength: 2, reg: fp2gp, asm: "FLED"},                    // arg0 <= arg1
	}

	RISCV64blocks := []blockData{
		{name: "BNE", controls: 1}, // Control != 0 (take a register)
	}

	archs = append(archs, arch{
		name:            "RISCV64",
		pkg:             "cmd/internal/obj/riscv",
		genfile:         "../../riscv64/ssa.go",
		ops:             RISCV64ops,
		blocks:          RISCV64blocks,
		regnames:        regNamesRISCV64,
		gpregmask:       gpMask,
		fpregmask:       fpMask,
		framepointerreg: -1, // not used
	})
}
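
A quick way to sanity-check the 64-name budget above: 32 X registers minus LR (skipped so the pseudo-register SB gets a slot), plus 32 F registers, plus SB itself, is exactly 64 names, one bit each in the 64-bit regMask. Below is a standalone sketch of that bookkeeping, using plain strings instead of the riscv.REG_* constants and assuming X1 is the LR register:

package main

import "fmt"

func main() {
	type regMask uint64

	var names []string
	addreg := func(name string) regMask {
		mask := regMask(1) << uint(len(names))
		names = append(names, name)
		return mask
	}

	for i := 0; i < 32; i++ {
		if i == 1 { // assumption: X1 is LR, which regalloc skips
			continue
		}
		addreg(fmt.Sprintf("X%d", i))
	}
	for i := 0; i < 32; i++ {
		addreg(fmt.Sprintf("F%d", i))
	}
	sb := addreg("SB")

	fmt.Println(len(names))         // 64, so the "> 64" panic above never fires
	fmt.Printf("%#x\n", uint64(sb)) // 0x8000000000000000: SB takes the final bit
}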
src/cmd/compile/internal/ssa/rewriteRISCV64.go (new file, 5561 lines): diff suppressed because it is too large.
@@ -66,7 +66,7 @@ func (op Op) isLoweredGetClosurePtr() bool {
 	switch op {
 	case OpAMD64LoweredGetClosurePtr, OpPPC64LoweredGetClosurePtr, OpARMLoweredGetClosurePtr, OpARM64LoweredGetClosurePtr,
 		Op386LoweredGetClosurePtr, OpMIPS64LoweredGetClosurePtr, OpS390XLoweredGetClosurePtr, OpMIPSLoweredGetClosurePtr,
-		OpWasmLoweredGetClosurePtr:
+		OpRISCV64LoweredGetClosurePtr, OpWasmLoweredGetClosurePtr:
 		return true
 	}
 	return false
@@ -115,7 +115,7 @@ func schedule(f *Func) {
 			v.Op == OpARMLoweredNilCheck || v.Op == OpARM64LoweredNilCheck ||
 			v.Op == Op386LoweredNilCheck || v.Op == OpMIPS64LoweredNilCheck ||
 			v.Op == OpS390XLoweredNilCheck || v.Op == OpMIPSLoweredNilCheck ||
-			v.Op == OpWasmLoweredNilCheck:
+			v.Op == OpRISCV64LoweredNilCheck || v.Op == OpWasmLoweredNilCheck:
 			// Nil checks must come before loads from the same address.
 			score[v.ID] = ScoreNilCheck
 		case v.Op == OpPhi:
@@ -12,6 +12,7 @@ import (
 	"cmd/compile/internal/mips"
 	"cmd/compile/internal/mips64"
 	"cmd/compile/internal/ppc64"
+	"cmd/compile/internal/riscv64"
 	"cmd/compile/internal/s390x"
 	"cmd/compile/internal/wasm"
 	"cmd/compile/internal/x86"
@@ -32,6 +33,7 @@ var archInits = map[string]func(*gc.Arch){
 	"mips64le": mips64.Init,
 	"ppc64":    ppc64.Init,
 	"ppc64le":  ppc64.Init,
+	"riscv64":  riscv64.Init,
 	"s390x":    s390x.Init,
 	"wasm":     wasm.Init,
 }
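
For context, archInits is how the compiler driver picks a backend: with the entry added above, a GOARCH=riscv64 build looks up riscv64.Init and hands it to the gc front end. Below is a self-contained sketch of that dispatch pattern, with a stand-in Arch type instead of *gc.Arch and a hard-coded string instead of objabi.GOARCH:

package main

import (
	"fmt"
	"os"
)

// Arch stands in for *gc.Arch; the real table maps GOARCH names to backend
// Init functions, riscv64.Init included after this change.
type Arch struct{ backend string }

var archInits = map[string]func(*Arch){
	"riscv64": func(a *Arch) { a.backend = "riscv64" },
}

func main() {
	goarch := "riscv64" // in the compiler this comes from objabi.GOARCH
	archInit, ok := archInits[goarch]
	if !ok {
		fmt.Fprintf(os.Stderr, "compile: unknown architecture %q\n", goarch)
		os.Exit(2)
	}
	var a Arch
	archInit(&a) // the real driver passes the chosen Init along to gc.Main
	fmt.Println("selected backend:", a.backend)
}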
src/cmd/dist/buildtool.go (vendored, 3 changed lines)
@@ -45,10 +45,11 @@ var bootstrapDirs = []string{
 	"cmd/compile/internal/mips",
 	"cmd/compile/internal/mips64",
 	"cmd/compile/internal/ppc64",
-	"cmd/compile/internal/types",
+	"cmd/compile/internal/riscv64",
 	"cmd/compile/internal/s390x",
 	"cmd/compile/internal/ssa",
 	"cmd/compile/internal/syntax",
+	"cmd/compile/internal/types",
 	"cmd/compile/internal/x86",
 	"cmd/compile/internal/wasm",
 	"cmd/internal/bio",
@@ -11,11 +11,11 @@ import (
 )
 
 func init() {
-	obj.RegisterRegister(obj.RBaseRISCV, REG_END, regName)
+	obj.RegisterRegister(obj.RBaseRISCV, REG_END, RegName)
 	obj.RegisterOpcode(obj.ABaseRISCV, Anames)
 }
 
-func regName(r int) string {
+func RegName(r int) string {
 	switch {
 	case r == 0:
 		return "NONE"
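
Exporting RegName matters to the SSA work above: RISCV64Ops.go names every register via riscv.RegName and then looks entries up by name (regNamed["X5"], regNamed["g"], and so on). A hypothetical in-package example test is sketched below; only the RegName(0) == "NONE" case is verbatim from this hunk, while the "X5" result is an assumption implied by those regNamed lookups:

package riscv

import "fmt"

// ExampleRegName is a hypothetical example test for the newly exported
// RegName. RegName(0) == "NONE" is shown in the hunk above; the "X5" result
// is assumed from the regNamed["X5"] lookups in RISCV64Ops.go, which rely on
// X registers being named "X<n>".
func ExampleRegName() {
	fmt.Println(RegName(0))
	fmt.Println(RegName(REG_X5))
	// Output:
	// NONE
	// X5
}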
@@ -487,8 +487,8 @@ func rewriteMOV(ctxt *obj.Link, newprog obj.ProgAlloc, p *obj.Prog) {
 	}
 }
 
-// invertBranch inverts the condition of a conditional branch.
-func invertBranch(i obj.As) obj.As {
+// InvertBranch inverts the condition of a conditional branch.
+func InvertBranch(i obj.As) obj.As {
 	switch i {
 	case ABEQ:
 		return ABNE
@@ -503,7 +503,7 @@ func invertBranch(i obj.As) obj.As {
 	case ABGEU:
 		return ABLTU
 	default:
-		panic("invertBranch: not a branch")
+		panic("InvertBranch: not a branch")
 	}
 }
 
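
InvertBranch is used in the preprocess hunk below: when a conditional branch's target is out of range, the condition is inverted so the branch skips over an unconditional jump that carries the long displacement. A hypothetical in-package test, covering only the inversions visible in these hunks, could look like this:

package riscv

import (
	"testing"

	"cmd/internal/obj"
)

func TestInvertBranch(t *testing.T) {
	// Only the pairs visible in the surrounding hunks; the real switch
	// covers every conditional branch opcode.
	cases := []struct{ in, want obj.As }{
		{ABEQ, ABNE},
		{ABGEU, ABLTU},
	}
	for _, c := range cases {
		if got := InvertBranch(c.in); got != c.want {
			t.Errorf("InvertBranch(%v) = %v, want %v", c.in, got, c.want)
		}
	}
}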
@@ -800,7 +800,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 			jmp.To = obj.Addr{Type: obj.TYPE_BRANCH}
 			jmp.Pcond = p.Pcond
 
-			p.As = invertBranch(p.As)
+			p.As = InvertBranch(p.As)
 			p.Pcond = jmp.Link
 
 			// We may have made previous branches too long,
@@ -1005,7 +1005,7 @@ func wantImmU(p *obj.Prog, pos string, a obj.Addr, nbits uint) {
 
 func wantReg(p *obj.Prog, pos string, descr string, r, min, max int16) {
 	if r < min || r > max {
-		p.Ctxt.Diag("%v\texpected %s register in %s position but got non-%s register %s", p, descr, pos, descr, regName(int(r)))
+		p.Ctxt.Diag("%v\texpected %s register in %s position but got non-%s register %s", p, descr, pos, descr, RegName(int(r)))
 	}
 }
 