diff --git a/src/cmd/compile/internal/test/inl_test.go b/src/cmd/compile/internal/test/inl_test.go
index 1dbd68cd67..760bb7a999 100644
--- a/src/cmd/compile/internal/test/inl_test.go
+++ b/src/cmd/compile/internal/test/inl_test.go
@@ -175,9 +175,6 @@ func TestIntendedInlining(t *testing.T) {
 		},
 		"math/big": {
 			"bigEndianWord",
-			// The following functions require the math_big_pure_go build tag.
-			"addVW",
-			"subVW",
 		},
 		"math/rand": {
 			"(*rngSource).Int63",
diff --git a/src/math/big/arith.go b/src/math/big/arith.go
index cd2b8a4228..e2cd99f602 100644
--- a/src/math/big/arith.go
+++ b/src/math/big/arith.go
@@ -10,7 +10,10 @@
 
 package big
 
-import "math/bits"
+import (
+	"math/bits"
+	_ "unsafe" // for go:linkname
+)
 
 // A Word represents a single digit of a multi-precision unsigned integer.
 type Word uint
@@ -82,11 +85,50 @@ func subVV_g(z, x, y []Word) (c Word) {
 	return
 }
 
-// The resulting carry c is either 0 or 1.
-func addVW_g(z, x []Word, y Word) (c Word) {
+// addVW sets z = x + y (vector plus single word), returning the final carry c.
+// The behavior is undefined if len(x) != len(z).
+// If len(z) == 0, c = y; otherwise, c is 0 or 1.
+//
+// addVW should be an internal detail,
+// but widely used packages access it using linkname.
+// Notable members of the hall of shame include:
+//   - github.com/remyoudompheng/bigfft
+//
+// Do not remove or change the type signature.
+// See go.dev/issue/67401.
+//
+//go:linkname addVW
+func addVW(z, x []Word, y Word) (c Word) {
+	x = x[:len(z)] // reslice once; panics if cap(x) < len(z), and presumably lets the compiler drop bounds checks on x[i]
+	if len(z) == 0 {
+		return y
+	}
+	zi, cc := bits.Add(uint(x[0]), uint(y), 0)
+	z[0] = Word(zi)
+	if cc == 0 {
+		if &z[0] != &x[0] { // skip the copy when z and x start at the same word (in-place add)
+			copy(z[1:], x[1:])
+		}
+		return 0
+	}
+	for i := 1; i < len(z); i++ { // carry out of word 0: add 1 to successive words until one does not wrap
+		xi := x[i]
+		if xi != ^Word(0) {
+			z[i] = xi + 1
+			if &z[0] != &x[0] {
+				copy(z[i+1:], x[i+1:])
+			}
+			return 0
+		}
+		z[i] = 0 // all-ones word plus 1 wraps to 0; the carry continues
+	}
+	return 1 // every remaining word wrapped, so the carry leaves the top word
+}
+
+// addVW_ref is the simple word-by-word reference implementation for addVW, used only for testing.
+func addVW_ref(z, x []Word, y Word) (c Word) {
 	c = y
-	// The comment near the top of this file discusses this for loop condition.
-	for i := 0; i < len(z) && i < len(x); i++ {
+	for i := range z { // reads x[i] for every index of z, so x must be at least as long as z
 		zi, cc := bits.Add(uint(x[i]), uint(c), 0)
 		z[i] = Word(zi)
 		c = Word(cc)
@@ -94,53 +136,55 @@ func addVW_g(z, x []Word, y Word) (c Word) {
 	return
 }
 
-// addVWlarge is addVW, but intended for large z.
-// The only difference is that we check on every iteration
-// whether we are done with carries,
-// and if so, switch to a much faster copy instead.
-// This is only a good idea for large z,
-// because the overhead of the check and the function call
-// outweigh the benefits when z is small.
-func addVWlarge(z, x []Word, y Word) (c Word) {
-	c = y
-	// The comment near the top of this file discusses this for loop condition.
-	for i := 0; i < len(z) && i < len(x); i++ {
-		if c == 0 {
-			copy(z[i:], x[i:])
-			return
-		}
-		zi, cc := bits.Add(uint(x[i]), uint(c), 0)
-		z[i] = Word(zi)
-		c = Word(cc)
+// subVW sets z = x - y (vector minus single word), returning the final carry (borrow) c.
+// The behavior is undefined if len(x) != len(z).
+// If len(z) == 0, c = y; otherwise, c is 0 or 1.
+//
+// subVW should be an internal detail,
+// but widely used packages access it using linkname.
+// Notable members of the hall of shame include:
+//   - github.com/remyoudompheng/bigfft
+//
+// Do not remove or change the type signature.
+// See go.dev/issue/67401.
+//
+//go:linkname subVW
+func subVW(z, x []Word, y Word) (c Word) {
+	x = x[:len(z)] // reslice once; panics if cap(x) < len(z), and presumably lets the compiler drop bounds checks on x[i]
+	if len(z) == 0 {
+		return y
 	}
-	return
+	zi, cc := bits.Sub(uint(x[0]), uint(y), 0)
+	z[0] = Word(zi)
+	if cc == 0 {
+		if &z[0] != &x[0] { // skip the copy when z and x start at the same word (in-place subtract)
+			copy(z[1:], x[1:])
+		}
+		return 0
+	}
+	for i := 1; i < len(z); i++ { // borrow out of word 0: subtract 1 from successive words until one does not wrap
+		xi := x[i]
+		if xi != 0 {
+			z[i] = xi - 1
+			if &z[0] != &x[0] {
+				copy(z[i+1:], x[i+1:])
+			}
+			return 0
+		}
+		z[i] = ^Word(0) // zero word minus 1 wraps to all ones; the borrow continues
+	}
+	return 1 // every remaining word wrapped, so the borrow leaves the top word
 }
 
-func subVW_g(z, x []Word, y Word) (c Word) {
+// subVW_ref is the simple word-by-word reference implementation for subVW, used only for testing.
+func subVW_ref(z, x []Word, y Word) (c Word) {
 	c = y
-	// The comment near the top of this file discusses this for loop condition.
-	for i := 0; i < len(z) && i < len(x); i++ {
+	for i := range z { // reads x[i] for every index of z, so x must be at least as long as z
 		zi, cc := bits.Sub(uint(x[i]), uint(c), 0)
 		z[i] = Word(zi)
 		c = Word(cc)
 	}
-	return
-}
-
-// subVWlarge is to subVW as addVWlarge is to addVW.
-func subVWlarge(z, x []Word, y Word) (c Word) {
-	c = y
-	// The comment near the top of this file discusses this for loop condition.
-	for i := 0; i < len(z) && i < len(x); i++ {
-		if c == 0 {
-			copy(z[i:], x[i:])
-			return
-		}
-		zi, cc := bits.Sub(uint(x[i]), uint(c), 0)
-		z[i] = Word(zi)
-		c = Word(cc)
-	}
-	return
+	return c
 }
 
 func lshVU_g(z, x []Word, s uint) (c Word) {
diff --git a/src/math/big/arith_386.s b/src/math/big/arith_386.s
index c3567c632d..a989503c1c 100644
--- a/src/math/big/arith_386.s
+++ b/src/math/big/arith_386.s
@@ -60,51 +60,6 @@ E2:	CMPL BX, BP		// i < n
 	RET
 
 
-// func addVW(z, x []Word, y Word) (c Word)
-TEXT ·addVW(SB),NOSPLIT,$0
-	MOVL z+0(FP), DI
-	MOVL x+12(FP), SI
-	MOVL y+24(FP), AX	// c = y
-	MOVL z_len+4(FP), BP
-	MOVL $0, BX		// i = 0
-	JMP E3
-
-L3:	ADDL (SI)(BX*4), AX
-	MOVL AX, (DI)(BX*4)
-	SBBL AX, AX		// save CF
-	NEGL AX
-	ADDL $1, BX		// i++
-
-E3:	CMPL BX, BP		// i < n
-	JL L3
-
-	MOVL AX, c+28(FP)
-	RET
-
-
-// func subVW(z, x []Word, y Word) (c Word)
-TEXT ·subVW(SB),NOSPLIT,$0
-	MOVL z+0(FP), DI
-	MOVL x+12(FP), SI
-	MOVL y+24(FP), AX	// c = y
-	MOVL z_len+4(FP), BP
-	MOVL $0, BX		// i = 0
-	JMP E4
-
-L4:	MOVL (SI)(BX*4), DX
-	SUBL AX, DX
-	MOVL DX, (DI)(BX*4)
-	SBBL AX, AX		// save CF
-	NEGL AX
-	ADDL $1, BX		// i++
-
-E4:	CMPL BX, BP		// i < n
-	JL L4
-
-	MOVL AX, c+28(FP)
-	RET
-
-
 // func lshVU(z, x []Word, s uint) (c Word)
 TEXT ·lshVU(SB),NOSPLIT,$0
 	MOVL z_len+4(FP), BX	// i = z
diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s
index 2e1d68f935..66bc6d41ce 100644
--- a/src/math/big/arith_amd64.s
+++ b/src/math/big/arith_amd64.s
@@ -121,119 +121,6 @@ E2:	NEGQ CX
 	MOVQ CX, c+72(FP)	// return c
 	RET
 
-
-// func addVW(z, x []Word, y Word) (c Word)
-TEXT ·addVW(SB),NOSPLIT,$0
-	MOVQ z_len+8(FP), DI
-	CMPQ DI, $32
-	JG large
-	MOVQ x+24(FP), R8
-	MOVQ y+48(FP), CX	// c = y
-	MOVQ z+0(FP), R10
-
-	MOVQ $0, SI		// i = 0
-
-	// s/JL/JMP/ below to disable the unrolled loop
-	SUBQ $4, DI		// n -= 4
-	JL V3			// if n < 4 goto V3
-
-U3:	// n >= 0
-	// regular loop body unrolled 4x
-	MOVQ 0(R8)(SI*8), R11
-	MOVQ 8(R8)(SI*8), R12
-	MOVQ 16(R8)(SI*8), R13
-	MOVQ 24(R8)(SI*8), R14
-	ADDQ CX, R11
-	ADCQ $0, R12
-	ADCQ $0, R13
-	ADCQ $0, R14
-	SBBQ CX, CX		// save CF
-	NEGQ CX
-	MOVQ R11, 0(R10)(SI*8)
-	MOVQ R12, 8(R10)(SI*8)
-	MOVQ R13, 16(R10)(SI*8)
-	MOVQ R14, 24(R10)(SI*8)
-
-	ADDQ $4, SI		// i += 4
-	SUBQ $4, DI		// n -= 4
-	JGE U3			// if n >= 0 goto U3
-
-V3:	ADDQ $4, DI		// n += 4
-	JLE E3			// if n <= 0 goto E3
-
-L3:	// n > 0
-	ADDQ 0(R8)(SI*8), CX
-	MOVQ CX, 0(R10)(SI*8)
-	SBBQ CX, CX		// save CF
-	NEGQ CX
-
-	ADDQ $1, SI		// i++
-	SUBQ $1, DI		// n--
-	JG L3			// if n > 0 goto L3
-
-E3:	MOVQ CX, c+56(FP)	// return c
-	RET
-large:
-	JMP ·addVWlarge(SB)
-
-
-// func subVW(z, x []Word, y Word) (c Word)
-// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
-TEXT ·subVW(SB),NOSPLIT,$0
-	MOVQ z_len+8(FP), DI
-	CMPQ DI, $32
-	JG large
-	MOVQ x+24(FP), R8
-	MOVQ y+48(FP), CX	// c = y
-	MOVQ z+0(FP), R10
-
-	MOVQ $0, SI		// i = 0
-
-	// s/JL/JMP/ below to disable the unrolled loop
-	SUBQ $4, DI		// n -= 4
-	JL V4			// if n < 4 goto V4
-
-U4:	// n >= 0
-	// regular loop body unrolled 4x
-	MOVQ 0(R8)(SI*8), R11
-	MOVQ 8(R8)(SI*8), R12
-	MOVQ 16(R8)(SI*8), R13
-	MOVQ 24(R8)(SI*8), R14
-	SUBQ CX, R11
-	SBBQ $0, R12
-	SBBQ $0, R13
-	SBBQ $0, R14
-	SBBQ CX, CX		// save CF
-	NEGQ CX
-	MOVQ R11, 0(R10)(SI*8)
-	MOVQ R12, 8(R10)(SI*8)
-	MOVQ R13, 16(R10)(SI*8)
-	MOVQ R14, 24(R10)(SI*8)
-
-	ADDQ $4, SI		// i += 4
-	SUBQ $4, DI		// n -= 4
-	JGE U4			// if n >= 0 goto U4
-
-V4:	ADDQ $4, DI		// n += 4
-	JLE E4			// if n <= 0 goto E4
-
-L4:	// n > 0
-	MOVQ 0(R8)(SI*8), R11
-	SUBQ CX, R11
-	MOVQ R11, 0(R10)(SI*8)
-	SBBQ CX, CX		// save CF
-	NEGQ CX
-
-	ADDQ $1, SI		// i++
-	SUBQ $1, DI		// n--
-	JG L4			// if n > 0 goto L4
-
-E4:	MOVQ CX, c+56(FP)	// return c
-	RET
-large:
-	JMP ·subVWlarge(SB)
-
-
 // func lshVU(z, x []Word, s uint) (c Word)
 TEXT ·lshVU(SB),NOSPLIT,$0
 	MOVQ z_len+8(FP), BX	// i = z
diff --git a/src/math/big/arith_arm.s b/src/math/big/arith_arm.s
index 5b04e07bd0..ce9fe5f6fb 100644
--- a/src/math/big/arith_arm.s
+++ b/src/math/big/arith_arm.s
@@ -58,66 +58,6 @@ E2:
 	RET
 
 
-// func addVW(z, x []Word, y Word) (c Word)
-TEXT ·addVW(SB),NOSPLIT,$0
-	MOVW	z+0(FP), R1
-	MOVW	z_len+4(FP), R4
-	MOVW	x+12(FP), R2
-	MOVW	y+24(FP), R3
-	ADD	R4<<2, R1, R4
-	TEQ	R1, R4
-	BNE L3a
-	MOVW	R3, c+28(FP)
-	RET
-L3a:
-	MOVW.P	4(R2), R5
-	ADD.S	R3, R5
-	MOVW.P	R5, 4(R1)
-	B	E3
-L3:
-	MOVW.P	4(R2), R5
-	ADC.S	$0, R5
-	MOVW.P	R5, 4(R1)
-E3:
-	TEQ	R1, R4
-	BNE	L3
-
-	MOVW	$0, R0
-	MOVW.CS	$1, R0
-	MOVW	R0, c+28(FP)
-	RET
-
-
-// func subVW(z, x []Word, y Word) (c Word)
-TEXT ·subVW(SB),NOSPLIT,$0
-	MOVW	z+0(FP), R1
-	MOVW	z_len+4(FP), R4
-	MOVW	x+12(FP), R2
-	MOVW	y+24(FP), R3
-	ADD	R4<<2, R1, R4
-	TEQ	R1, R4
-	BNE L4a
-	MOVW	R3, c+28(FP)
-	RET
-L4a:
-	MOVW.P	4(R2), R5
-	SUB.S	R3, R5
-	MOVW.P	R5, 4(R1)
-	B	E4
-L4:
-	MOVW.P	4(R2), R5
-	SBC.S	$0, R5
-	MOVW.P	R5, 4(R1)
-E4:
-	TEQ	R1, R4
-	BNE	L4
-
-	MOVW	$0, R0
-	MOVW.CC	$1, R0
-	MOVW	R0, c+28(FP)
-	RET
-
-
 // func lshVU(z, x []Word, s uint) (c Word)
 TEXT ·lshVU(SB),NOSPLIT,$0
 	MOVW	z_len+4(FP), R5
diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s
index e0a8b39e78..aa7dd6755d 100644
--- a/src/math/big/arith_arm64.s
+++ b/src/math/big/arith_arm64.s
@@ -93,164 +93,6 @@ done:
 	MOVD	R0, c+72(FP)
 	RET
 
-#define vwOneOp(instr, op1)				\
-	MOVD.P	8(R1), R4;				\
-	instr	op1, R4;				\
-	MOVD.P	R4, 8(R3);
-
-// handle the first 1~4 elements before starting iteration in addVW/subVW
-#define vwPreIter(instr1, instr2, counter, target)	\
-	vwOneOp(instr1, R2);				\
-	SUB	$1, counter;				\
-	CBZ	counter, target;			\
-	vwOneOp(instr2, $0);				\
-	SUB	$1, counter;				\
-	CBZ	counter, target;			\
-	vwOneOp(instr2, $0);				\
-	SUB	$1, counter;				\
-	CBZ	counter, target;			\
-	vwOneOp(instr2, $0);
-
-// do one iteration of add or sub in addVW/subVW
-#define vwOneIter(instr, counter, exit)	\
-	CBZ	counter, exit;		\	// careful not to touch the carry flag
-	LDP.P	32(R1), (R4, R5);	\
-	LDP	-16(R1), (R6, R7);	\
-	instr	$0, R4, R8;		\
-	instr	$0, R5, R9;		\
-	instr	$0, R6, R10;		\
-	instr	$0, R7, R11;		\
-	STP.P	(R8, R9), 32(R3);	\
-	STP	(R10, R11), -16(R3);	\
-	SUB	$4, counter;
-
-// do one iteration of copy in addVW/subVW
-#define vwOneIterCopy(counter, exit)			\
-	CBZ	counter, exit;				\
-	LDP.P	32(R1), (R4, R5);			\
-	LDP	-16(R1), (R6, R7);			\
-	STP.P	(R4, R5), 32(R3);			\
-	STP	(R6, R7), -16(R3);			\
-	SUB	$4, counter;
-
-// func addVW(z, x []Word, y Word) (c Word)
-// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
-// and switches to copy if we are done with carries. The copying is skipped as well
-// if 'x' and 'z' happen to share the same underlying storage.
-// The overhead of the checking and branching is visible when 'z' are small (~5%),
-// so set a threshold of 32, and remain the small-sized part entirely untouched.
-TEXT ·addVW(SB),NOSPLIT,$0
-	MOVD	z+0(FP), R3
-	MOVD	z_len+8(FP), R0
-	MOVD	x+24(FP), R1
-	MOVD	y+48(FP), R2
-	CMP	$32, R0
-	BGE	large		// large-sized 'z' and 'x'
-	CBZ	R0, len0	// the length of z is 0
-	MOVD.P	8(R1), R4
-	ADDS	R2, R4		// z[0] = x[0] + y, set carry
-	MOVD.P	R4, 8(R3)
-	SUB	$1, R0
-	CBZ	R0, len1	// the length of z is 1
-	TBZ	$0, R0, two
-	MOVD.P	8(R1), R4	// do it once
-	ADCS	$0, R4
-	MOVD.P	R4, 8(R3)
-	SUB	$1, R0
-two:				// do it twice
-	TBZ	$1, R0, loop
-	LDP.P	16(R1), (R4, R5)
-	ADCS	$0, R4, R8	// c, z[i] = x[i] + c
-	ADCS	$0, R5, R9
-	STP.P	(R8, R9), 16(R3)
-	SUB	$2, R0
-loop:				// do four times per round
-	vwOneIter(ADCS, R0, len1)
-	B	loop
-len1:
-	CSET	HS, R2		// extract carry flag
-len0:
-	MOVD	R2, c+56(FP)
-done:
-	RET
-large:
-	AND	$0x3, R0, R10
-	AND	$~0x3, R0
-	// unrolling for the first 1~4 elements to avoid saving the carry
-	// flag in each step, adjust $R0 if we unrolled 4 elements
-	vwPreIter(ADDS, ADCS, R10, add4)
-	SUB	$4, R0
-add4:
-	BCC	copy
-	vwOneIter(ADCS, R0, len1)
-	B	add4
-copy:
-	MOVD	ZR, c+56(FP)
-	CMP	R1, R3
-	BEQ	done
-copy_4:				// no carry flag, copy the rest
-	vwOneIterCopy(R0, done)
-	B	copy_4
-
-// func subVW(z, x []Word, y Word) (c Word)
-// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
-// and switches to copy if we are done with carries. The copying is skipped as well
-// if 'x' and 'z' happen to share the same underlying storage.
-// The overhead of the checking and branching is visible when 'z' are small (~5%),
-// so set a threshold of 32, and remain the small-sized part entirely untouched.
-TEXT ·subVW(SB),NOSPLIT,$0
-	MOVD	z+0(FP), R3
-	MOVD	z_len+8(FP), R0
-	MOVD	x+24(FP), R1
-	MOVD	y+48(FP), R2
-	CMP	$32, R0
-	BGE	large		// large-sized 'z' and 'x'
-	CBZ	R0, len0	// the length of z is 0
-	MOVD.P	8(R1), R4
-	SUBS	R2, R4		// z[0] = x[0] - y, set carry
-	MOVD.P	R4, 8(R3)
-	SUB	$1, R0
-	CBZ	R0, len1	// the length of z is 1
-	TBZ	$0, R0, two	// do it once
-	MOVD.P	8(R1), R4
-	SBCS	$0, R4
-	MOVD.P	R4, 8(R3)
-	SUB	$1, R0
-two:				// do it twice
-	TBZ	$1, R0, loop
-	LDP.P	16(R1), (R4, R5)
-	SBCS	$0, R4, R8	// c, z[i] = x[i] + c
-	SBCS	$0, R5, R9
-	STP.P	(R8, R9), 16(R3)
-	SUB	$2, R0
-loop:				// do four times per round
-	vwOneIter(SBCS, R0, len1)
-	B	loop
-len1:
-	CSET	LO, R2		// extract carry flag
-len0:
-	MOVD	R2, c+56(FP)
-done:
-	RET
-large:
-	AND	$0x3, R0, R10
-	AND	$~0x3, R0
-	// unrolling for the first 1~4 elements to avoid saving the carry
-	// flag in each step, adjust $R0 if we unrolled 4 elements
-	vwPreIter(SUBS, SBCS, R10, sub4)
-	SUB	$4, R0
-sub4:
-	BCS	copy
-	vwOneIter(SBCS, R0, len1)
-	B	sub4
-copy:
-	MOVD	ZR, c+56(FP)
-	CMP	R1, R3
-	BEQ	done
-copy_4:				// no carry flag, copy the rest
-	vwOneIterCopy(R0, done)
-	B	copy_4
-
 // func lshVU(z, x []Word, s uint) (c Word)
 // This implementation handles the shift operation from the high word to the low word,
 // which may be an error for the case where the low word of x overlaps with the high
diff --git a/src/math/big/arith_decl.go b/src/math/big/arith_decl.go
index ca73485df0..aa838808b9 100644
--- a/src/math/big/arith_decl.go
+++ b/src/math/big/arith_decl.go
@@ -34,30 +34,6 @@ func addVV(z, x, y []Word) (c Word)
 //go:noescape
 func subVV(z, x, y []Word) (c Word)
 
-// addVW should be an internal detail,
-// but widely used packages access it using linkname.
-// Notable members of the hall of shame include:
-//   - github.com/remyoudompheng/bigfft
-//
-// Do not remove or change the type signature.
-// See go.dev/issue/67401.
-//
-//go:linkname addVW
-//go:noescape
-func addVW(z, x []Word, y Word) (c Word)
-
-// subVW should be an internal detail,
-// but widely used packages access it using linkname.
-// Notable members of the hall of shame include:
-//   - github.com/remyoudompheng/bigfft
-//
-// Do not remove or change the type signature.
-// See go.dev/issue/67401.
-//
-//go:linkname subVW
-//go:noescape
-func subVW(z, x []Word, y Word) (c Word)
-
 // shlVU should be an internal detail (and a stale one at that),
 // but widely used packages access it using linkname.
 // Notable members of the hall of shame include:
diff --git a/src/math/big/arith_decl_pure.go b/src/math/big/arith_decl_pure.go
index 60672d3e6c..3b051356fb 100644
--- a/src/math/big/arith_decl_pure.go
+++ b/src/math/big/arith_decl_pure.go
@@ -14,24 +14,6 @@ func subVV(z, x, y []Word) (c Word) {
 	return subVV_g(z, x, y)
 }
 
-func addVW(z, x []Word, y Word) (c Word) {
-	// TODO: remove indirect function call when golang.org/issue/30548 is fixed
-	fn := addVW_g
-	if len(z) > 32 {
-		fn = addVWlarge
-	}
-	return fn(z, x, y)
-}
-
-func subVW(z, x []Word, y Word) (c Word) {
-	// TODO: remove indirect function call when golang.org/issue/30548 is fixed
-	fn := subVW_g
-	if len(z) > 32 {
-		fn = subVWlarge
-	}
-	return fn(z, x, y)
-}
-
 func lshVU(z, x []Word, s uint) (c Word) {
 	return lshVU_g(z, x, s)
 }
diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s
index 3480e0e676..8a5140e57a 100644
--- a/src/math/big/arith_loong64.s
+++ b/src/math/big/arith_loong64.s
@@ -42,56 +42,6 @@ done:
 	MOVV	R8, c+72(FP)
 	RET
 
-// func addVW(z, x []Word, y Word) (c Word)
-TEXT ·addVW(SB),NOSPLIT,$0
-	// input:
-	//   R4: z
-	//   R5: z_len
-	//   R7: x
-	//   R10: y
-	MOVV	z+0(FP), R4
-	MOVV	z_len+8(FP), R5
-	MOVV	x+24(FP), R7
-	MOVV	y+48(FP), R10
-	MOVV	$0, R6
-	SLLV	$3, R5
-loop:
-	BEQ	R5, R6, done
-	MOVV	(R6)(R7), R8
-	ADDV	R8, R10, R9	// x1 + c = z1, if z1 < x1 then z1 overflow
-	SGTU	R8, R9, R10
-	MOVV	R9, (R6)(R4)
-	ADDV	$8, R6
-	JMP	loop
-done:
-	MOVV	R10, c+56(FP)
-	RET
-
-// func subVW(z, x []Word, y Word) (c Word)
-TEXT ·subVW(SB),NOSPLIT,$0
-	// input:
-	//   R4: z
-	//   R5: z_len
-	//   R7: x
-	//   R10: y
-	MOVV	z+0(FP), R4
-	MOVV	z_len+8(FP), R5
-	MOVV	x+24(FP), R7
-	MOVV	y+48(FP), R10
-	MOVV	$0, R6
-	SLLV	$3, R5
-loop:
-	BEQ	R5, R6, done
-	MOVV	(R6)(R7), R8
-	SUBV	R10, R8, R11	// x1 - c = z1, if z1 > x1 then overflow
-	SGTU	R11, R8, R10
-	MOVV	R11, (R6)(R4)
-	ADDV	$8, R6
-	JMP	loop
-done:
-	MOVV	R10, c+56(FP)
-	RET
-
 TEXT ·lshVU(SB),NOSPLIT,$0
 	JMP ·lshVU_g(SB)
 
diff --git a/src/math/big/arith_mips64x.s b/src/math/big/arith_mips64x.s
index 6c6da48c32..3b32062b06 100644
--- a/src/math/big/arith_mips64x.s
+++ b/src/math/big/arith_mips64x.s
@@ -15,12 +15,6 @@ TEXT ·addVV(SB),NOSPLIT,$0
 TEXT ·subVV(SB),NOSPLIT,$0
 	JMP ·subVV_g(SB)
 
-TEXT ·addVW(SB),NOSPLIT,$0
-	JMP ·addVW_g(SB)
-
-TEXT ·subVW(SB),NOSPLIT,$0
-	JMP ·subVW_g(SB)
-
 TEXT ·lshVU(SB),NOSPLIT,$0
 	JMP ·lshVU_g(SB)
 
diff --git a/src/math/big/arith_mipsx.s b/src/math/big/arith_mipsx.s
index 0e2a0a4b8b..edd7456c3e 100644
--- a/src/math/big/arith_mipsx.s
+++ b/src/math/big/arith_mipsx.s
@@ -15,12 +15,6 @@ TEXT ·addVV(SB),NOSPLIT,$0
 TEXT ·subVV(SB),NOSPLIT,$0
 	JMP	·subVV_g(SB)
 
-TEXT ·addVW(SB),NOSPLIT,$0
-	JMP	·addVW_g(SB)
-
-TEXT ·subVW(SB),NOSPLIT,$0
-	JMP	·subVW_g(SB)
-
 TEXT ·lshVU(SB),NOSPLIT,$0
 	JMP	·lshVU_g(SB)
 
diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
index a47ea83aa3..5392c1be26 100644
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -188,157 +188,6 @@ done:
 	MOVD  R4, c+72(FP)
 	RET
 
-// func addVW(z, x []Word, y Word) (c Word)
-TEXT ·addVW(SB), NOSPLIT, $0
-	MOVD z+0(FP), R10	// R10 = z[]
-	MOVD x+24(FP), R8	// R8 = x[]
-	MOVD y+48(FP), R4	// R4 = y = c
-	MOVD z_len+8(FP), R11	// R11 = z_len
-
-	CMP   R11, $0		// If z_len is zero, return
-	BEQ   done
-
-	// We will process the first iteration out of the loop so we capture
-	// the value of c. In the subsequent iterations, we will rely on the
-	// value of CA set here.
-	MOVD  0(R8), R20	// R20 = x[i]
-	ADD   $-1, R11		// R11 = z_len - 1
-	ADDC  R20, R4, R6	// R6 = x[i] + c
-	CMP   R11, $0		// If z_len was 1, we are done
-	MOVD  R6, 0(R10)	// z[i]
-	BEQ   final
-
-	// We will read 4 elements per iteration
-	SRDCC $2, R11, R9	// R9 = z_len/4
-	DCBT  (R8)
-	MOVD  R9, CTR		// Set up the loop counter
-	BEQ   tail		// If R9 = 0, we can't use the loop
-	PCALIGN $16
-
-loop:
-	MOVD  8(R8), R20	// R20 = x[i]
-	MOVD  16(R8), R21	// R21 = x[i+1]
-	MOVD  24(R8), R22	// R22 = x[i+2]
-	MOVDU 32(R8), R23	// R23 = x[i+3]
-	ADDZE R20, R24		// R24 = x[i] + CA
-	ADDZE R21, R25		// R25 = x[i+1] + CA
-	ADDZE R22, R26		// R26 = x[i+2] + CA
-	ADDZE R23, R27		// R27 = x[i+3] + CA
-	MOVD  R24, 8(R10)	// z[i]
-	MOVD  R25, 16(R10)	// z[i+1]
-	MOVD  R26, 24(R10)	// z[i+2]
-	MOVDU R27, 32(R10)	// z[i+3]
-	ADD   $-4, R11		// R11 = z_len - 4
-	BDNZ  loop
-
-	// We may have some elements to read
-	CMP R11, $0
-	BEQ final
-
-tail:
-	MOVDU 8(R8), R20
-	ADDZE R20, R24
-	ADD $-1, R11
-	MOVDU R24, 8(R10)
-	CMP R11, $0
-	BEQ final
-
-	MOVDU 8(R8), R20
-	ADDZE R20, R24
-	ADD $-1, R11
-	MOVDU R24, 8(R10)
-	CMP R11, $0
-	BEQ final
-
-	MOVD 8(R8), R20
-	ADDZE R20, R24
-	MOVD R24, 8(R10)
-
-final:
-	ADDZE R0, R4		// c = CA
-done:
-	MOVD  R4, c+56(FP)
-	RET
-
-// func subVW(z, x []Word, y Word) (c Word)
-TEXT ·subVW(SB), NOSPLIT, $0
-	MOVD  z+0(FP), R10	// R10 = z[]
-	MOVD  x+24(FP), R8	// R8 = x[]
-	MOVD  y+48(FP), R4	// R4 = y = c
-	MOVD  z_len+8(FP), R11	// R11 = z_len
-
-	CMP   R11, $0		// If z_len is zero, return
-	BEQ   done
-
-	// We will process the first iteration out of the loop so we capture
-	// the value of c. In the subsequent iterations, we will rely on the
-	// value of CA set here.
-	MOVD  0(R8), R20	// R20 = x[i]
-	ADD   $-1, R11		// R11 = z_len - 1
-	SUBC  R4, R20, R6	// R6 = x[i] - c
-	CMP   R11, $0		// If z_len was 1, we are done
-	MOVD  R6, 0(R10)	// z[i]
-	BEQ   final
-
-	// We will read 4 elements per iteration
-	SRDCC $2, R11, R9	// R9 = z_len/4
-	DCBT  (R8)
-	MOVD  R9, CTR		// Set up the loop counter
-	BEQ   tail		// If R9 = 0, we can't use the loop
-
-	// The loop here is almost the same as the one used in s390x, but
-	// we don't need to capture CA every iteration because we've already
-	// done that above.
-
-	PCALIGN $16
-loop:
-	MOVD  8(R8), R20
-	MOVD  16(R8), R21
-	MOVD  24(R8), R22
-	MOVDU 32(R8), R23
-	SUBE  R0, R20
-	SUBE  R0, R21
-	SUBE  R0, R22
-	SUBE  R0, R23
-	MOVD  R20, 8(R10)
-	MOVD  R21, 16(R10)
-	MOVD  R22, 24(R10)
-	MOVDU R23, 32(R10)
-	ADD   $-4, R11
-	BDNZ  loop
-
-	// We may have some elements to read
-	CMP   R11, $0
-	BEQ   final
-
-tail:
-	MOVDU 8(R8), R20
-	SUBE  R0, R20
-	ADD   $-1, R11
-	MOVDU R20, 8(R10)
-	CMP   R11, $0
-	BEQ   final
-
-	MOVDU 8(R8), R20
-	SUBE  R0, R20
-	ADD   $-1, R11
-	MOVDU R20, 8(R10)
-	CMP   R11, $0
-	BEQ   final
-
-	MOVD  8(R8), R20
-	SUBE  R0, R20
-	MOVD  R20, 8(R10)
-
-final:
-	// Capture CA
-	SUBE  R4, R4
-	NEG   R4, R4
-
-done:
-	MOVD  R4, c+56(FP)
-	RET
-
 //func lshVU(z, x []Word, s uint) (c Word)
 TEXT ·lshVU(SB), NOSPLIT, $0
 	MOVD    z+0(FP), R3
diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s
index 1ba25ce387..406cf38d1f 100644
--- a/src/math/big/arith_riscv64.s
+++ b/src/math/big/arith_riscv64.s
@@ -173,126 +173,6 @@ done:
 	MOV	X29, c+72(FP)	// return b
 	RET
 
-TEXT ·addVW(SB),NOSPLIT,$0
-	MOV	x+24(FP), X5
-	MOV	y+48(FP), X6
-	MOV	z+0(FP), X7
-	MOV	z_len+8(FP), X30
-
-	MOV	$4, X28
-	MOV	X6, X29		// c = y
-
-	BEQZ	X30, done
-	BLTU	X30, X28, loop1
-
-loop4:
-	MOV	0(X5), X8	// x[0]
-	MOV	8(X5), X11	// x[1]
-	MOV	16(X5), X14	// x[2]
-	MOV	24(X5), X17	// x[3]
-
-	ADD	X8, X29, X10	// z[0] = x[0] + c
-	SLTU	X8, X10, X29	// next c
-
-	ADD	X11, X29, X13	// z[1] = x[1] + c
-	SLTU	X11, X13, X29	// next c
-
-	ADD	X14, X29, X16	// z[2] = x[2] + c
-	SLTU	X14, X16, X29	// next c
-
-	ADD	X17, X29, X19	// z[3] = x[3] + c
-	SLTU	X17, X19, X29	// next c
-
-	MOV	X10, 0(X7)	// z[0]
-	MOV	X13, 8(X7)	// z[1]
-	MOV	X16, 16(X7)	// z[2]
-	MOV	X19, 24(X7)	// z[3]
-
-	ADD	$32, X5
-	ADD	$32, X7
-	SUB	$4, X30
-
-	BGEU	X30, X28, loop4
-	BEQZ	X30, done
-
-loop1:
-	MOV	0(X5), X10	// x
-
-	ADD	X10, X29, X12	// z = x + c
-	SLTU	X10, X12, X29	// next c
-
-	MOV	X12, 0(X7)	// z
-
-	ADD	$8, X5
-	ADD	$8, X7
-	SUB	$1, X30
-
-	BNEZ	X30, loop1
-
-done:
-	MOV	X29, c+56(FP)	// return c
-	RET
-
-TEXT ·subVW(SB),NOSPLIT,$0
-	MOV	x+24(FP), X5
-	MOV	y+48(FP), X6
-	MOV	z+0(FP), X7
-	MOV	z_len+8(FP), X30
-
-	MOV	$4, X28
-	MOV	X6, X29		// b = y
-
-	BEQZ	X30, done
-	BLTU	X30, X28, loop1
-
-loop4:
-	MOV	0(X5), X8	// x[0]
-	MOV	8(X5), X11	// x[1]
-	MOV	16(X5), X14	// x[2]
-	MOV	24(X5), X17	// x[3]
-
-	SUB	X29, X8, X10	// z[0] = x[0] - b
-	SLTU	X10, X8, X29	// next b
-
-	SUB	X29, X11, X13	// z[1] = x[1] - b
-	SLTU	X13, X11, X29	// next b
-
-	SUB	X29, X14, X16	// z[2] = x[2] - b
-	SLTU	X16, X14, X29	// next b
-
-	SUB	X29, X17, X19	// z[3] = x[3] - b
-	SLTU	X19, X17, X29	// next b
-
-	MOV	X10, 0(X7)	// z[0]
-	MOV	X13, 8(X7)	// z[1]
-	MOV	X16, 16(X7)	// z[2]
-	MOV	X19, 24(X7)	// z[3]
-
-	ADD	$32, X5
-	ADD	$32, X7
-	SUB	$4, X30
-
-	BGEU	X30, X28, loop4
-	BEQZ	X30, done
-
-loop1:
-	MOV	0(X5), X10	// x
-
-	SUB	X29, X10, X12	// z = x - b
-	SLTU	X12, X10, X29	// next b
-
-	MOV	X12, 0(X7)	// z
-
-	ADD	$8, X5
-	ADD	$8, X7
-	SUB	$1, X30
-
-	BNEZ	X30, loop1
-
-done:
-	MOV	X29, c+56(FP)	// return b
-	RET
-
 TEXT ·lshVU(SB),NOSPLIT,$0
 	JMP ·lshVU_g(SB)
 
diff --git a/src/math/big/arith_s390x.s b/src/math/big/arith_s390x.s
index 57b263a4c3..a03660be62 100644
--- a/src/math/big/arith_s390x.s
+++ b/src/math/big/arith_s390x.s
@@ -500,188 +500,6 @@ E1:
 	MOVD R4, c+72(FP) // return c
 	RET
 
-TEXT ·addVW(SB), NOSPLIT, $0
-	MOVD z_len+8(FP), R5 // length of z
-	MOVD x+24(FP), R6
-	MOVD y+48(FP), R7    // c = y
-	MOVD z+0(FP), R8
-
-	CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return
-
-	// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
-	ADDC   0(R6), R7
-	MOVD   R7, 0(R8)
-	CMPBEQ R5, $1, returnResult // len(z) == 1
-	MOVD   $0, R9
-	ADDE   8(R6), R9
-	MOVD   R9, 8(R8)
-	CMPBEQ R5, $2, returnResult // len(z) == 2
-
-	// Update the counters
-	MOVD $16, R12    // i = 2
-	MOVD $-2(R5), R5 // n = n - 2
-
-loopOverEachWord:
-	BRC  $12, copySetup // carry = 0, copy the rest
-	MOVD $1, R9
-
-	// Originally we used the carry flag generated in the previous iteration
-	// (i.e: ADDE could be used here to do the addition).  However, since we
-	// already know carry is 1 (otherwise we will go to copy section), we can use
-	// ADDC here so the current iteration does not depend on the carry flag
-	// generated in the previous iteration. This could be useful when branch prediction happens.
-	ADDC 0(R6)(R12*1), R9
-	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c
-
-	MOVD  $8(R12), R12         // i++
-	BRCTG R5, loopOverEachWord // n--
-
-// Return the current carry value
-returnResult:
-	MOVD $0, R0
-	ADDE R0, R0
-	MOVD R0, c+56(FP)
-	RET
-
-// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
-// With the assumption that x and z will not overlap with each other or x and z will
-// point to same memory region, we can use a faster version of copy using only MVC here.
-// In the following implementation, we have three copy loops, each copying a word, 4 words, and
-// 32 words at a time.  Via benchmarking, this implementation is faster than calling runtime·memmove.
-copySetup:
-	ADD R12, R6
-	ADD R12, R8
-
-	CMPBGE R5, $4, mediumLoop
-
-smallLoop:  // does a loop unrolling to copy word when n < 4
-	CMPBEQ R5, $0, returnZero
-	MVC    $8, 0(R6), 0(R8)
-	CMPBEQ R5, $1, returnZero
-	MVC    $8, 8(R6), 8(R8)
-	CMPBEQ R5, $2, returnZero
-	MVC    $8, 16(R6), 16(R8)
-
-returnZero:
-	MOVD $0, c+56(FP) // return 0 as carry
-	RET
-
-mediumLoop:
-	CMPBLT R5, $4, smallLoop
-	CMPBLT R5, $32, mediumLoopBody
-
-largeLoop:  // Copying 256 bytes at a time.
-	MVC    $256, 0(R6), 0(R8)
-	MOVD   $256(R6), R6
-	MOVD   $256(R8), R8
-	MOVD   $-32(R5), R5
-	CMPBGE R5, $32, largeLoop
-	BR     mediumLoop
-
-mediumLoopBody:  // Copying 32 bytes at a time
-	MVC    $32, 0(R6), 0(R8)
-	MOVD   $32(R6), R6
-	MOVD   $32(R8), R8
-	MOVD   $-4(R5), R5
-	CMPBGE R5, $4, mediumLoopBody
-	BR     smallLoop
-
-returnC:
-	MOVD R7, c+56(FP)
-	RET
-
-TEXT ·subVW(SB), NOSPLIT, $0
-	MOVD z_len+8(FP), R5
-	MOVD x+24(FP), R6
-	MOVD y+48(FP), R7    // The borrow bit passed in
-	MOVD z+0(FP), R8
-	MOVD $0, R0          // R0 is a temporary variable used during computation. Ensure it has zero in it.
-
-	CMPBEQ R5, $0, returnC // len(z) == 0, have an early return
-
-	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
-	MOVD   0(R6), R9
-	SUBC   R7, R9
-	MOVD   R9, 0(R8)
-	CMPBEQ R5, $1, returnResult
-	MOVD   8(R6), R9
-	SUBE   R0, R9
-	MOVD   R9, 8(R8)
-	CMPBEQ R5, $2, returnResult
-
-	// Update the counters
-	MOVD $16, R12    // i = 2
-	MOVD $-2(R5), R5 // n = n - 2
-
-loopOverEachWord:
-	BRC  $3, copySetup    // no borrow, copy the rest
-	MOVD 0(R6)(R12*1), R9
-
-	// Originally we used the borrow flag generated in the previous iteration
-	// (i.e: SUBE could be used here to do the subtraction). However, since we
-	// already know borrow is 1 (otherwise we will go to copy section), we can
-	// use SUBC here so the current iteration does not depend on the borrow flag
-	// generated in the previous iteration. This could be useful when branch prediction happens.
-	SUBC $1, R9
-	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1
-
-	MOVD  $8(R12), R12         // i++
-	BRCTG R5, loopOverEachWord // n--
-
-// return the current borrow value
-returnResult:
-	SUBE R0, R0
-	NEG  R0, R0
-	MOVD R0, c+56(FP)
-	RET
-
-// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
-// With the assumption that x and z will not overlap with each other or x and z will
-// point to same memory region, we can use a faster version of copy using only MVC here.
-// In the following implementation, we have three copy loops, each copying a word, 4 words, and
-// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
-copySetup:
-	ADD R12, R6
-	ADD R12, R8
-
-	CMPBGE R5, $4, mediumLoop
-
-smallLoop:  // does a loop unrolling to copy word when n < 4
-	CMPBEQ R5, $0, returnZero
-	MVC    $8, 0(R6), 0(R8)
-	CMPBEQ R5, $1, returnZero
-	MVC    $8, 8(R6), 8(R8)
-	CMPBEQ R5, $2, returnZero
-	MVC    $8, 16(R6), 16(R8)
-
-returnZero:
-	MOVD $0, c+56(FP) // return 0 as borrow
-	RET
-
-mediumLoop:
-	CMPBLT R5, $4, smallLoop
-	CMPBLT R5, $32, mediumLoopBody
-
-largeLoop:  // Copying 256 bytes at a time
-	MVC    $256, 0(R6), 0(R8)
-	MOVD   $256(R6), R6
-	MOVD   $256(R8), R8
-	MOVD   $-32(R5), R5
-	CMPBGE R5, $32, largeLoop
-	BR     mediumLoop
-
-mediumLoopBody:  // Copying 32 bytes at a time
-	MVC    $32, 0(R6), 0(R8)
-	MOVD   $32(R6), R6
-	MOVD   $32(R8), R8
-	MOVD   $-4(R5), R5
-	CMPBGE R5, $4, mediumLoopBody
-	BR     smallLoop
-
-returnC:
-	MOVD R7, c+56(FP)
-	RET
-
 // func lshVU(z, x []Word, s uint) (c Word)
 TEXT ·lshVU(SB), NOSPLIT, $0
 	BR ·lshVU_g(SB)
diff --git a/src/math/big/arith_test.go b/src/math/big/arith_test.go
index b6e7304a13..bd9f96870b 100644
--- a/src/math/big/arith_test.go
+++ b/src/math/big/arith_test.go
@@ -28,8 +28,8 @@ var shifts = []uint{1, 2, 3, _W/4 - 1, _W / 4, _W/4 + 1, _W/2 - 1, _W / 2, _W/2
 
 func TestAddVV(t *testing.T)      { testVV(t, "addVV", addVV, addVV_g) }
 func TestSubVV(t *testing.T)      { testVV(t, "subVV", subVV, subVV_g) }
-func TestAddVW(t *testing.T)      { testVW(t, "addVW", addVW, addVW_g, words4) }
-func TestSubVW(t *testing.T)      { testVW(t, "subVW", subVW, subVW_g, words4) }
+func TestAddVW(t *testing.T)      { testVW(t, "addVW", addVW, addVW_ref, words4) }
+func TestSubVW(t *testing.T)      { testVW(t, "subVW", subVW, subVW_ref, words4) }
 func TestLshVU(t *testing.T)      { testVU(t, "lshVU", lshVU, lshVU_g, shifts) }
 func TestRshVU(t *testing.T)      { testVU(t, "rshVU", rshVU, rshVU_g, shifts) }
 func TestMulAddVWW(t *testing.T)  { testVWW(t, "mulAddVWW", mulAddVWW, mulAddVWW_g, muls) }
@@ -865,21 +865,15 @@ func benchVV(fn func(z, x, y []Word) Word) benchFunc {
 }
 
 func BenchmarkAddVW(b *testing.B) {
-	bench(b, "/impl=asm/data=random", benchVW(addVW, 123))
-	bench(b, "/impl=asm/data=carry", benchCarryVW(addVW, ^Word(0), 1))
-	bench(b, "/impl=asm/data=shortcut", benchShortVW(addVW, 123))
-	bench(b, "/impl=go/data=random", benchVW(addVW_g, 123))
-	bench(b, "/impl=go/data=carry", benchCarryVW(addVW_g, ^Word(0), 1))
-	bench(b, "/impl=go/data=shortcut", benchShortVW(addVW_g, 123))
+	bench(b, "/data=random", benchVW(addVW, 123))
+	bench(b, "/data=carry", benchCarryVW(addVW, ^Word(0), 1))
+	bench(b, "/data=shortcut", benchShortVW(addVW, 123))
 }
 
 func BenchmarkSubVW(b *testing.B) {
-	bench(b, "/impl=asm/data=random", benchVW(subVW, 123))
-	bench(b, "/impl=asm/data=carry", benchCarryVW(subVW, 0, 1))
-	bench(b, "/impl=asm/data=shortcut", benchShortVW(subVW, 123))
-	bench(b, "/impl=go/data=random", benchVW(subVW_g, 123))
-	bench(b, "/impl=go/data=carry", benchCarryVW(subVW_g, 0, 1))
-	bench(b, "/impl=go/data=shortcut", benchShortVW(subVW_g, 123))
+	bench(b, "/data=random", benchVW(subVW, 123))
+	bench(b, "/data=carry", benchCarryVW(subVW, 0, 1))
+	bench(b, "/data=shortcut", benchShortVW(subVW, 123))
 }
 
 func benchVW(fn func(z, x []Word, w Word) Word, w Word) benchFunc {
diff --git a/src/math/big/arith_wasm.s b/src/math/big/arith_wasm.s
index 8aadeaa28d..3a9aa4ddcb 100644
--- a/src/math/big/arith_wasm.s
+++ b/src/math/big/arith_wasm.s
@@ -12,12 +12,6 @@ TEXT ·addVV(SB),NOSPLIT,$0
 TEXT ·subVV(SB),NOSPLIT,$0
 	JMP ·subVV_g(SB)
 
-TEXT ·addVW(SB),NOSPLIT,$0
-	JMP ·addVW_g(SB)
-
-TEXT ·subVW(SB),NOSPLIT,$0
-	JMP ·subVW_g(SB)
-
 TEXT ·lshVU(SB),NOSPLIT,$0
 	JMP ·lshVU_g(SB)