diff --git a/src/cmd/compile/internal/test/inl_test.go b/src/cmd/compile/internal/test/inl_test.go index 1dbd68cd67..760bb7a999 100644 --- a/src/cmd/compile/internal/test/inl_test.go +++ b/src/cmd/compile/internal/test/inl_test.go @@ -175,9 +175,6 @@ func TestIntendedInlining(t *testing.T) { }, "math/big": { "bigEndianWord", - // The following functions require the math_big_pure_go build tag. - "addVW", - "subVW", }, "math/rand": { "(*rngSource).Int63", diff --git a/src/math/big/arith.go b/src/math/big/arith.go index cd2b8a4228..e2cd99f602 100644 --- a/src/math/big/arith.go +++ b/src/math/big/arith.go @@ -10,7 +10,10 @@ package big -import "math/bits" +import ( + "math/bits" + _ "unsafe" // for go:linkname +) // A Word represents a single digit of a multi-precision unsigned integer. type Word uint @@ -82,11 +85,50 @@ func subVV_g(z, x, y []Word) (c Word) { return } -// The resulting carry c is either 0 or 1. -func addVW_g(z, x []Word, y Word) (c Word) { +// addVW sets z = x + y, returning the final carry c. +// The behavior is undefined if len(x) != len(z). +// If len(z) == 0, c = y; otherwise, c is 0 or 1. +// +// addVW should be an internal detail, +// but widely used packages access it using linkname. +// Notable members of the hall of shame include: +// - github.com/remyoudompheng/bigfft +// +// Do not remove or change the type signature. +// See go.dev/issue/67401. +// +//go:linkname addVW +func addVW(z, x []Word, y Word) (c Word) { + x = x[:len(z)] + if len(z) == 0 { + return y + } + zi, cc := bits.Add(uint(x[0]), uint(y), 0) + z[0] = Word(zi) + if cc == 0 { + if &z[0] != &x[0] { + copy(z[1:], x[1:]) + } + return 0 + } + for i := 1; i < len(z); i++ { + xi := x[i] + if xi != ^Word(0) { + z[i] = xi + 1 + if &z[0] != &x[0] { + copy(z[i+1:], x[i+1:]) + } + return 0 + } + z[i] = 0 + } + return 1 +} + +// addVW_ref is the reference implementation for addVW, used only for testing. +func addVW_ref(z, x []Word, y Word) (c Word) { c = y - // The comment near the top of this file discusses this for loop condition. - for i := 0; i < len(z) && i < len(x); i++ { + for i := range z { zi, cc := bits.Add(uint(x[i]), uint(c), 0) z[i] = Word(zi) c = Word(cc) @@ -94,53 +136,55 @@ func addVW_g(z, x []Word, y Word) (c Word) { return } -// addVWlarge is addVW, but intended for large z. -// The only difference is that we check on every iteration -// whether we are done with carries, -// and if so, switch to a much faster copy instead. -// This is only a good idea for large z, -// because the overhead of the check and the function call -// outweigh the benefits when z is small. -func addVWlarge(z, x []Word, y Word) (c Word) { - c = y - // The comment near the top of this file discusses this for loop condition. - for i := 0; i < len(z) && i < len(x); i++ { - if c == 0 { - copy(z[i:], x[i:]) - return - } - zi, cc := bits.Add(uint(x[i]), uint(c), 0) - z[i] = Word(zi) - c = Word(cc) +// subVW sets z = x - y, returning the final carry c. +// The behavior is undefined if len(x) != len(z). +// If len(z) == 0, c = y; otherwise, c is 0 or 1. +// +// subVW should be an internal detail, +// but widely used packages access it using linkname. +// Notable members of the hall of shame include: +// - github.com/remyoudompheng/bigfft +// +// Do not remove or change the type signature. +// See go.dev/issue/67401. +// +//go:linkname subVW +func subVW(z, x []Word, y Word) (c Word) { + x = x[:len(z)] + if len(z) == 0 { + return y } - return + zi, cc := bits.Sub(uint(x[0]), uint(y), 0) + z[0] = Word(zi) + if cc == 0 { + if &z[0] != &x[0] { + copy(z[1:], x[1:]) + } + return 0 + } + for i := 1; i < len(z); i++ { + xi := x[i] + if xi != 0 { + z[i] = xi - 1 + if &z[0] != &x[0] { + copy(z[i+1:], x[i+1:]) + } + return 0 + } + z[i] = ^Word(0) + } + return 1 } -func subVW_g(z, x []Word, y Word) (c Word) { +// subVW_ref is the reference implementation for subVW, used only for testing. +func subVW_ref(z, x []Word, y Word) (c Word) { c = y - // The comment near the top of this file discusses this for loop condition. - for i := 0; i < len(z) && i < len(x); i++ { + for i := range z { zi, cc := bits.Sub(uint(x[i]), uint(c), 0) z[i] = Word(zi) c = Word(cc) } - return -} - -// subVWlarge is to subVW as addVWlarge is to addVW. -func subVWlarge(z, x []Word, y Word) (c Word) { - c = y - // The comment near the top of this file discusses this for loop condition. - for i := 0; i < len(z) && i < len(x); i++ { - if c == 0 { - copy(z[i:], x[i:]) - return - } - zi, cc := bits.Sub(uint(x[i]), uint(c), 0) - z[i] = Word(zi) - c = Word(cc) - } - return + return c } func lshVU_g(z, x []Word, s uint) (c Word) { diff --git a/src/math/big/arith_386.s b/src/math/big/arith_386.s index c3567c632d..a989503c1c 100644 --- a/src/math/big/arith_386.s +++ b/src/math/big/arith_386.s @@ -60,51 +60,6 @@ E2: CMPL BX, BP // i < n RET -// func addVW(z, x []Word, y Word) (c Word) -TEXT ·addVW(SB),NOSPLIT,$0 - MOVL z+0(FP), DI - MOVL x+12(FP), SI - MOVL y+24(FP), AX // c = y - MOVL z_len+4(FP), BP - MOVL $0, BX // i = 0 - JMP E3 - -L3: ADDL (SI)(BX*4), AX - MOVL AX, (DI)(BX*4) - SBBL AX, AX // save CF - NEGL AX - ADDL $1, BX // i++ - -E3: CMPL BX, BP // i < n - JL L3 - - MOVL AX, c+28(FP) - RET - - -// func subVW(z, x []Word, y Word) (c Word) -TEXT ·subVW(SB),NOSPLIT,$0 - MOVL z+0(FP), DI - MOVL x+12(FP), SI - MOVL y+24(FP), AX // c = y - MOVL z_len+4(FP), BP - MOVL $0, BX // i = 0 - JMP E4 - -L4: MOVL (SI)(BX*4), DX - SUBL AX, DX - MOVL DX, (DI)(BX*4) - SBBL AX, AX // save CF - NEGL AX - ADDL $1, BX // i++ - -E4: CMPL BX, BP // i < n - JL L4 - - MOVL AX, c+28(FP) - RET - - // func lshVU(z, x []Word, s uint) (c Word) TEXT ·lshVU(SB),NOSPLIT,$0 MOVL z_len+4(FP), BX // i = z diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s index 2e1d68f935..66bc6d41ce 100644 --- a/src/math/big/arith_amd64.s +++ b/src/math/big/arith_amd64.s @@ -121,119 +121,6 @@ E2: NEGQ CX MOVQ CX, c+72(FP) // return c RET - -// func addVW(z, x []Word, y Word) (c Word) -TEXT ·addVW(SB),NOSPLIT,$0 - MOVQ z_len+8(FP), DI - CMPQ DI, $32 - JG large - MOVQ x+24(FP), R8 - MOVQ y+48(FP), CX // c = y - MOVQ z+0(FP), R10 - - MOVQ $0, SI // i = 0 - - // s/JL/JMP/ below to disable the unrolled loop - SUBQ $4, DI // n -= 4 - JL V3 // if n < 4 goto V3 - -U3: // n >= 0 - // regular loop body unrolled 4x - MOVQ 0(R8)(SI*8), R11 - MOVQ 8(R8)(SI*8), R12 - MOVQ 16(R8)(SI*8), R13 - MOVQ 24(R8)(SI*8), R14 - ADDQ CX, R11 - ADCQ $0, R12 - ADCQ $0, R13 - ADCQ $0, R14 - SBBQ CX, CX // save CF - NEGQ CX - MOVQ R11, 0(R10)(SI*8) - MOVQ R12, 8(R10)(SI*8) - MOVQ R13, 16(R10)(SI*8) - MOVQ R14, 24(R10)(SI*8) - - ADDQ $4, SI // i += 4 - SUBQ $4, DI // n -= 4 - JGE U3 // if n >= 0 goto U3 - -V3: ADDQ $4, DI // n += 4 - JLE E3 // if n <= 0 goto E3 - -L3: // n > 0 - ADDQ 0(R8)(SI*8), CX - MOVQ CX, 0(R10)(SI*8) - SBBQ CX, CX // save CF - NEGQ CX - - ADDQ $1, SI // i++ - SUBQ $1, DI // n-- - JG L3 // if n > 0 goto L3 - -E3: MOVQ CX, c+56(FP) // return c - RET -large: - JMP ·addVWlarge(SB) - - -// func subVW(z, x []Word, y Word) (c Word) -// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names) -TEXT ·subVW(SB),NOSPLIT,$0 - MOVQ z_len+8(FP), DI - CMPQ DI, $32 - JG large - MOVQ x+24(FP), R8 - MOVQ y+48(FP), CX // c = y - MOVQ z+0(FP), R10 - - MOVQ $0, SI // i = 0 - - // s/JL/JMP/ below to disable the unrolled loop - SUBQ $4, DI // n -= 4 - JL V4 // if n < 4 goto V4 - -U4: // n >= 0 - // regular loop body unrolled 4x - MOVQ 0(R8)(SI*8), R11 - MOVQ 8(R8)(SI*8), R12 - MOVQ 16(R8)(SI*8), R13 - MOVQ 24(R8)(SI*8), R14 - SUBQ CX, R11 - SBBQ $0, R12 - SBBQ $0, R13 - SBBQ $0, R14 - SBBQ CX, CX // save CF - NEGQ CX - MOVQ R11, 0(R10)(SI*8) - MOVQ R12, 8(R10)(SI*8) - MOVQ R13, 16(R10)(SI*8) - MOVQ R14, 24(R10)(SI*8) - - ADDQ $4, SI // i += 4 - SUBQ $4, DI // n -= 4 - JGE U4 // if n >= 0 goto U4 - -V4: ADDQ $4, DI // n += 4 - JLE E4 // if n <= 0 goto E4 - -L4: // n > 0 - MOVQ 0(R8)(SI*8), R11 - SUBQ CX, R11 - MOVQ R11, 0(R10)(SI*8) - SBBQ CX, CX // save CF - NEGQ CX - - ADDQ $1, SI // i++ - SUBQ $1, DI // n-- - JG L4 // if n > 0 goto L4 - -E4: MOVQ CX, c+56(FP) // return c - RET -large: - JMP ·subVWlarge(SB) - - // func lshVU(z, x []Word, s uint) (c Word) TEXT ·lshVU(SB),NOSPLIT,$0 MOVQ z_len+8(FP), BX // i = z diff --git a/src/math/big/arith_arm.s b/src/math/big/arith_arm.s index 5b04e07bd0..ce9fe5f6fb 100644 --- a/src/math/big/arith_arm.s +++ b/src/math/big/arith_arm.s @@ -58,66 +58,6 @@ E2: RET -// func addVW(z, x []Word, y Word) (c Word) -TEXT ·addVW(SB),NOSPLIT,$0 - MOVW z+0(FP), R1 - MOVW z_len+4(FP), R4 - MOVW x+12(FP), R2 - MOVW y+24(FP), R3 - ADD R4<<2, R1, R4 - TEQ R1, R4 - BNE L3a - MOVW R3, c+28(FP) - RET -L3a: - MOVW.P 4(R2), R5 - ADD.S R3, R5 - MOVW.P R5, 4(R1) - B E3 -L3: - MOVW.P 4(R2), R5 - ADC.S $0, R5 - MOVW.P R5, 4(R1) -E3: - TEQ R1, R4 - BNE L3 - - MOVW $0, R0 - MOVW.CS $1, R0 - MOVW R0, c+28(FP) - RET - - -// func subVW(z, x []Word, y Word) (c Word) -TEXT ·subVW(SB),NOSPLIT,$0 - MOVW z+0(FP), R1 - MOVW z_len+4(FP), R4 - MOVW x+12(FP), R2 - MOVW y+24(FP), R3 - ADD R4<<2, R1, R4 - TEQ R1, R4 - BNE L4a - MOVW R3, c+28(FP) - RET -L4a: - MOVW.P 4(R2), R5 - SUB.S R3, R5 - MOVW.P R5, 4(R1) - B E4 -L4: - MOVW.P 4(R2), R5 - SBC.S $0, R5 - MOVW.P R5, 4(R1) -E4: - TEQ R1, R4 - BNE L4 - - MOVW $0, R0 - MOVW.CC $1, R0 - MOVW R0, c+28(FP) - RET - - // func lshVU(z, x []Word, s uint) (c Word) TEXT ·lshVU(SB),NOSPLIT,$0 MOVW z_len+4(FP), R5 diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s index e0a8b39e78..aa7dd6755d 100644 --- a/src/math/big/arith_arm64.s +++ b/src/math/big/arith_arm64.s @@ -93,164 +93,6 @@ done: MOVD R0, c+72(FP) RET -#define vwOneOp(instr, op1) \ - MOVD.P 8(R1), R4; \ - instr op1, R4; \ - MOVD.P R4, 8(R3); - -// handle the first 1~4 elements before starting iteration in addVW/subVW -#define vwPreIter(instr1, instr2, counter, target) \ - vwOneOp(instr1, R2); \ - SUB $1, counter; \ - CBZ counter, target; \ - vwOneOp(instr2, $0); \ - SUB $1, counter; \ - CBZ counter, target; \ - vwOneOp(instr2, $0); \ - SUB $1, counter; \ - CBZ counter, target; \ - vwOneOp(instr2, $0); - -// do one iteration of add or sub in addVW/subVW -#define vwOneIter(instr, counter, exit) \ - CBZ counter, exit; \ // careful not to touch the carry flag - LDP.P 32(R1), (R4, R5); \ - LDP -16(R1), (R6, R7); \ - instr $0, R4, R8; \ - instr $0, R5, R9; \ - instr $0, R6, R10; \ - instr $0, R7, R11; \ - STP.P (R8, R9), 32(R3); \ - STP (R10, R11), -16(R3); \ - SUB $4, counter; - -// do one iteration of copy in addVW/subVW -#define vwOneIterCopy(counter, exit) \ - CBZ counter, exit; \ - LDP.P 32(R1), (R4, R5); \ - LDP -16(R1), (R6, R7); \ - STP.P (R4, R5), 32(R3); \ - STP (R6, R7), -16(R3); \ - SUB $4, counter; - -// func addVW(z, x []Word, y Word) (c Word) -// The 'large' branch handles large 'z'. It checks the carry flag on every iteration -// and switches to copy if we are done with carries. The copying is skipped as well -// if 'x' and 'z' happen to share the same underlying storage. -// The overhead of the checking and branching is visible when 'z' are small (~5%), -// so set a threshold of 32, and remain the small-sized part entirely untouched. -TEXT ·addVW(SB),NOSPLIT,$0 - MOVD z+0(FP), R3 - MOVD z_len+8(FP), R0 - MOVD x+24(FP), R1 - MOVD y+48(FP), R2 - CMP $32, R0 - BGE large // large-sized 'z' and 'x' - CBZ R0, len0 // the length of z is 0 - MOVD.P 8(R1), R4 - ADDS R2, R4 // z[0] = x[0] + y, set carry - MOVD.P R4, 8(R3) - SUB $1, R0 - CBZ R0, len1 // the length of z is 1 - TBZ $0, R0, two - MOVD.P 8(R1), R4 // do it once - ADCS $0, R4 - MOVD.P R4, 8(R3) - SUB $1, R0 -two: // do it twice - TBZ $1, R0, loop - LDP.P 16(R1), (R4, R5) - ADCS $0, R4, R8 // c, z[i] = x[i] + c - ADCS $0, R5, R9 - STP.P (R8, R9), 16(R3) - SUB $2, R0 -loop: // do four times per round - vwOneIter(ADCS, R0, len1) - B loop -len1: - CSET HS, R2 // extract carry flag -len0: - MOVD R2, c+56(FP) -done: - RET -large: - AND $0x3, R0, R10 - AND $~0x3, R0 - // unrolling for the first 1~4 elements to avoid saving the carry - // flag in each step, adjust $R0 if we unrolled 4 elements - vwPreIter(ADDS, ADCS, R10, add4) - SUB $4, R0 -add4: - BCC copy - vwOneIter(ADCS, R0, len1) - B add4 -copy: - MOVD ZR, c+56(FP) - CMP R1, R3 - BEQ done -copy_4: // no carry flag, copy the rest - vwOneIterCopy(R0, done) - B copy_4 - -// func subVW(z, x []Word, y Word) (c Word) -// The 'large' branch handles large 'z'. It checks the carry flag on every iteration -// and switches to copy if we are done with carries. The copying is skipped as well -// if 'x' and 'z' happen to share the same underlying storage. -// The overhead of the checking and branching is visible when 'z' are small (~5%), -// so set a threshold of 32, and remain the small-sized part entirely untouched. -TEXT ·subVW(SB),NOSPLIT,$0 - MOVD z+0(FP), R3 - MOVD z_len+8(FP), R0 - MOVD x+24(FP), R1 - MOVD y+48(FP), R2 - CMP $32, R0 - BGE large // large-sized 'z' and 'x' - CBZ R0, len0 // the length of z is 0 - MOVD.P 8(R1), R4 - SUBS R2, R4 // z[0] = x[0] - y, set carry - MOVD.P R4, 8(R3) - SUB $1, R0 - CBZ R0, len1 // the length of z is 1 - TBZ $0, R0, two // do it once - MOVD.P 8(R1), R4 - SBCS $0, R4 - MOVD.P R4, 8(R3) - SUB $1, R0 -two: // do it twice - TBZ $1, R0, loop - LDP.P 16(R1), (R4, R5) - SBCS $0, R4, R8 // c, z[i] = x[i] + c - SBCS $0, R5, R9 - STP.P (R8, R9), 16(R3) - SUB $2, R0 -loop: // do four times per round - vwOneIter(SBCS, R0, len1) - B loop -len1: - CSET LO, R2 // extract carry flag -len0: - MOVD R2, c+56(FP) -done: - RET -large: - AND $0x3, R0, R10 - AND $~0x3, R0 - // unrolling for the first 1~4 elements to avoid saving the carry - // flag in each step, adjust $R0 if we unrolled 4 elements - vwPreIter(SUBS, SBCS, R10, sub4) - SUB $4, R0 -sub4: - BCS copy - vwOneIter(SBCS, R0, len1) - B sub4 -copy: - MOVD ZR, c+56(FP) - CMP R1, R3 - BEQ done -copy_4: // no carry flag, copy the rest - vwOneIterCopy(R0, done) - B copy_4 - // func lshVU(z, x []Word, s uint) (c Word) // This implementation handles the shift operation from the high word to the low word, // which may be an error for the case where the low word of x overlaps with the high diff --git a/src/math/big/arith_decl.go b/src/math/big/arith_decl.go index ca73485df0..aa838808b9 100644 --- a/src/math/big/arith_decl.go +++ b/src/math/big/arith_decl.go @@ -34,30 +34,6 @@ func addVV(z, x, y []Word) (c Word) //go:noescape func subVV(z, x, y []Word) (c Word) -// addVW should be an internal detail, -// but widely used packages access it using linkname. -// Notable members of the hall of shame include: -// - github.com/remyoudompheng/bigfft -// -// Do not remove or change the type signature. -// See go.dev/issue/67401. -// -//go:linkname addVW -//go:noescape -func addVW(z, x []Word, y Word) (c Word) - -// subVW should be an internal detail, -// but widely used packages access it using linkname. -// Notable members of the hall of shame include: -// - github.com/remyoudompheng/bigfft -// -// Do not remove or change the type signature. -// See go.dev/issue/67401. -// -//go:linkname subVW -//go:noescape -func subVW(z, x []Word, y Word) (c Word) - // shlVU should be an internal detail (and a stale one at that), // but widely used packages access it using linkname. // Notable members of the hall of shame include: diff --git a/src/math/big/arith_decl_pure.go b/src/math/big/arith_decl_pure.go index 60672d3e6c..3b051356fb 100644 --- a/src/math/big/arith_decl_pure.go +++ b/src/math/big/arith_decl_pure.go @@ -14,24 +14,6 @@ func subVV(z, x, y []Word) (c Word) { return subVV_g(z, x, y) } -func addVW(z, x []Word, y Word) (c Word) { - // TODO: remove indirect function call when golang.org/issue/30548 is fixed - fn := addVW_g - if len(z) > 32 { - fn = addVWlarge - } - return fn(z, x, y) -} - -func subVW(z, x []Word, y Word) (c Word) { - // TODO: remove indirect function call when golang.org/issue/30548 is fixed - fn := subVW_g - if len(z) > 32 { - fn = subVWlarge - } - return fn(z, x, y) -} - func lshVU(z, x []Word, s uint) (c Word) { return lshVU_g(z, x, s) } diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s index 3480e0e676..8a5140e57a 100644 --- a/src/math/big/arith_loong64.s +++ b/src/math/big/arith_loong64.s @@ -42,56 +42,6 @@ done: MOVV R8, c+72(FP) RET -// func addVW(z, x []Word, y Word) (c Word) -TEXT ·addVW(SB),NOSPLIT,$0 - // input: - // R4: z - // R5: z_len - // R7: x - // R10: y - MOVV z+0(FP), R4 - MOVV z_len+8(FP), R5 - MOVV x+24(FP), R7 - MOVV y+48(FP), R10 - MOVV $0, R6 - SLLV $3, R5 -loop: - BEQ R5, R6, done - MOVV (R6)(R7), R8 - ADDV R8, R10, R9 // x1 + c = z1, if z1 < x1 then z1 overflow - SGTU R8, R9, R10 - MOVV R9, (R6)(R4) - ADDV $8, R6 - JMP loop -done: - MOVV R10, c+56(FP) - RET - -// func subVW(z, x []Word, y Word) (c Word) -TEXT ·subVW(SB),NOSPLIT,$0 - // input: - // R4: z - // R5: z_len - // R7: x - // R10: y - MOVV z+0(FP), R4 - MOVV z_len+8(FP), R5 - MOVV x+24(FP), R7 - MOVV y+48(FP), R10 - MOVV $0, R6 - SLLV $3, R5 -loop: - BEQ R5, R6, done - MOVV (R6)(R7), R8 - SUBV R10, R8, R11 // x1 - c = z1, if z1 > x1 then overflow - SGTU R11, R8, R10 - MOVV R11, (R6)(R4) - ADDV $8, R6 - JMP loop -done: - MOVV R10, c+56(FP) - RET - TEXT ·lshVU(SB),NOSPLIT,$0 JMP ·lshVU_g(SB) diff --git a/src/math/big/arith_mips64x.s b/src/math/big/arith_mips64x.s index 6c6da48c32..3b32062b06 100644 --- a/src/math/big/arith_mips64x.s +++ b/src/math/big/arith_mips64x.s @@ -15,12 +15,6 @@ TEXT ·addVV(SB),NOSPLIT,$0 TEXT ·subVV(SB),NOSPLIT,$0 JMP ·subVV_g(SB) -TEXT ·addVW(SB),NOSPLIT,$0 - JMP ·addVW_g(SB) - -TEXT ·subVW(SB),NOSPLIT,$0 - JMP ·subVW_g(SB) - TEXT ·lshVU(SB),NOSPLIT,$0 JMP ·lshVU_g(SB) diff --git a/src/math/big/arith_mipsx.s b/src/math/big/arith_mipsx.s index 0e2a0a4b8b..edd7456c3e 100644 --- a/src/math/big/arith_mipsx.s +++ b/src/math/big/arith_mipsx.s @@ -15,12 +15,6 @@ TEXT ·addVV(SB),NOSPLIT,$0 TEXT ·subVV(SB),NOSPLIT,$0 JMP ·subVV_g(SB) -TEXT ·addVW(SB),NOSPLIT,$0 - JMP ·addVW_g(SB) - -TEXT ·subVW(SB),NOSPLIT,$0 - JMP ·subVW_g(SB) - TEXT ·lshVU(SB),NOSPLIT,$0 JMP ·lshVU_g(SB) diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s index a47ea83aa3..5392c1be26 100644 --- a/src/math/big/arith_ppc64x.s +++ b/src/math/big/arith_ppc64x.s @@ -188,157 +188,6 @@ done: MOVD R4, c+72(FP) RET -// func addVW(z, x []Word, y Word) (c Word) -TEXT ·addVW(SB), NOSPLIT, $0 - MOVD z+0(FP), R10 // R10 = z[] - MOVD x+24(FP), R8 // R8 = x[] - MOVD y+48(FP), R4 // R4 = y = c - MOVD z_len+8(FP), R11 // R11 = z_len - - CMP R11, $0 // If z_len is zero, return - BEQ done - - // We will process the first iteration out of the loop so we capture - // the value of c. In the subsequent iterations, we will rely on the - // value of CA set here. - MOVD 0(R8), R20 // R20 = x[i] - ADD $-1, R11 // R11 = z_len - 1 - ADDC R20, R4, R6 // R6 = x[i] + c - CMP R11, $0 // If z_len was 1, we are done - MOVD R6, 0(R10) // z[i] - BEQ final - - // We will read 4 elements per iteration - SRDCC $2, R11, R9 // R9 = z_len/4 - DCBT (R8) - MOVD R9, CTR // Set up the loop counter - BEQ tail // If R9 = 0, we can't use the loop - PCALIGN $16 - -loop: - MOVD 8(R8), R20 // R20 = x[i] - MOVD 16(R8), R21 // R21 = x[i+1] - MOVD 24(R8), R22 // R22 = x[i+2] - MOVDU 32(R8), R23 // R23 = x[i+3] - ADDZE R20, R24 // R24 = x[i] + CA - ADDZE R21, R25 // R25 = x[i+1] + CA - ADDZE R22, R26 // R26 = x[i+2] + CA - ADDZE R23, R27 // R27 = x[i+3] + CA - MOVD R24, 8(R10) // z[i] - MOVD R25, 16(R10) // z[i+1] - MOVD R26, 24(R10) // z[i+2] - MOVDU R27, 32(R10) // z[i+3] - ADD $-4, R11 // R11 = z_len - 4 - BDNZ loop - - // We may have some elements to read - CMP R11, $0 - BEQ final - -tail: - MOVDU 8(R8), R20 - ADDZE R20, R24 - ADD $-1, R11 - MOVDU R24, 8(R10) - CMP R11, $0 - BEQ final - - MOVDU 8(R8), R20 - ADDZE R20, R24 - ADD $-1, R11 - MOVDU R24, 8(R10) - CMP R11, $0 - BEQ final - - MOVD 8(R8), R20 - ADDZE R20, R24 - MOVD R24, 8(R10) - -final: - ADDZE R0, R4 // c = CA -done: - MOVD R4, c+56(FP) - RET - -// func subVW(z, x []Word, y Word) (c Word) -TEXT ·subVW(SB), NOSPLIT, $0 - MOVD z+0(FP), R10 // R10 = z[] - MOVD x+24(FP), R8 // R8 = x[] - MOVD y+48(FP), R4 // R4 = y = c - MOVD z_len+8(FP), R11 // R11 = z_len - - CMP R11, $0 // If z_len is zero, return - BEQ done - - // We will process the first iteration out of the loop so we capture - // the value of c. In the subsequent iterations, we will rely on the - // value of CA set here. - MOVD 0(R8), R20 // R20 = x[i] - ADD $-1, R11 // R11 = z_len - 1 - SUBC R4, R20, R6 // R6 = x[i] - c - CMP R11, $0 // If z_len was 1, we are done - MOVD R6, 0(R10) // z[i] - BEQ final - - // We will read 4 elements per iteration - SRDCC $2, R11, R9 // R9 = z_len/4 - DCBT (R8) - MOVD R9, CTR // Set up the loop counter - BEQ tail // If R9 = 0, we can't use the loop - - // The loop here is almost the same as the one used in s390x, but - // we don't need to capture CA every iteration because we've already - // done that above. - - PCALIGN $16 -loop: - MOVD 8(R8), R20 - MOVD 16(R8), R21 - MOVD 24(R8), R22 - MOVDU 32(R8), R23 - SUBE R0, R20 - SUBE R0, R21 - SUBE R0, R22 - SUBE R0, R23 - MOVD R20, 8(R10) - MOVD R21, 16(R10) - MOVD R22, 24(R10) - MOVDU R23, 32(R10) - ADD $-4, R11 - BDNZ loop - - // We may have some elements to read - CMP R11, $0 - BEQ final - -tail: - MOVDU 8(R8), R20 - SUBE R0, R20 - ADD $-1, R11 - MOVDU R20, 8(R10) - CMP R11, $0 - BEQ final - - MOVDU 8(R8), R20 - SUBE R0, R20 - ADD $-1, R11 - MOVDU R20, 8(R10) - CMP R11, $0 - BEQ final - - MOVD 8(R8), R20 - SUBE R0, R20 - MOVD R20, 8(R10) - -final: - // Capture CA - SUBE R4, R4 - NEG R4, R4 - -done: - MOVD R4, c+56(FP) - RET - //func lshVU(z, x []Word, s uint) (c Word) TEXT ·lshVU(SB), NOSPLIT, $0 MOVD z+0(FP), R3 diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s index 1ba25ce387..406cf38d1f 100644 --- a/src/math/big/arith_riscv64.s +++ b/src/math/big/arith_riscv64.s @@ -173,126 +173,6 @@ done: MOV X29, c+72(FP) // return b RET -TEXT ·addVW(SB),NOSPLIT,$0 - MOV x+24(FP), X5 - MOV y+48(FP), X6 - MOV z+0(FP), X7 - MOV z_len+8(FP), X30 - - MOV $4, X28 - MOV X6, X29 // c = y - - BEQZ X30, done - BLTU X30, X28, loop1 - -loop4: - MOV 0(X5), X8 // x[0] - MOV 8(X5), X11 // x[1] - MOV 16(X5), X14 // x[2] - MOV 24(X5), X17 // x[3] - - ADD X8, X29, X10 // z[0] = x[0] + c - SLTU X8, X10, X29 // next c - - ADD X11, X29, X13 // z[1] = x[1] + c - SLTU X11, X13, X29 // next c - - ADD X14, X29, X16 // z[2] = x[2] + c - SLTU X14, X16, X29 // next c - - ADD X17, X29, X19 // z[3] = x[3] + c - SLTU X17, X19, X29 // next c - - MOV X10, 0(X7) // z[0] - MOV X13, 8(X7) // z[1] - MOV X16, 16(X7) // z[2] - MOV X19, 24(X7) // z[3] - - ADD $32, X5 - ADD $32, X7 - SUB $4, X30 - - BGEU X30, X28, loop4 - BEQZ X30, done - -loop1: - MOV 0(X5), X10 // x - - ADD X10, X29, X12 // z = x + c - SLTU X10, X12, X29 // next c - - MOV X12, 0(X7) // z - - ADD $8, X5 - ADD $8, X7 - SUB $1, X30 - - BNEZ X30, loop1 - -done: - MOV X29, c+56(FP) // return c - RET - -TEXT ·subVW(SB),NOSPLIT,$0 - MOV x+24(FP), X5 - MOV y+48(FP), X6 - MOV z+0(FP), X7 - MOV z_len+8(FP), X30 - - MOV $4, X28 - MOV X6, X29 // b = y - - BEQZ X30, done - BLTU X30, X28, loop1 - -loop4: - MOV 0(X5), X8 // x[0] - MOV 8(X5), X11 // x[1] - MOV 16(X5), X14 // x[2] - MOV 24(X5), X17 // x[3] - - SUB X29, X8, X10 // z[0] = x[0] - b - SLTU X10, X8, X29 // next b - - SUB X29, X11, X13 // z[1] = x[1] - b - SLTU X13, X11, X29 // next b - - SUB X29, X14, X16 // z[2] = x[2] - b - SLTU X16, X14, X29 // next b - - SUB X29, X17, X19 // z[3] = x[3] - b - SLTU X19, X17, X29 // next b - - MOV X10, 0(X7) // z[0] - MOV X13, 8(X7) // z[1] - MOV X16, 16(X7) // z[2] - MOV X19, 24(X7) // z[3] - - ADD $32, X5 - ADD $32, X7 - SUB $4, X30 - - BGEU X30, X28, loop4 - BEQZ X30, done - -loop1: - MOV 0(X5), X10 // x - - SUB X29, X10, X12 // z = x - b - SLTU X12, X10, X29 // next b - - MOV X12, 0(X7) // z - - ADD $8, X5 - ADD $8, X7 - SUB $1, X30 - - BNEZ X30, loop1 - -done: - MOV X29, c+56(FP) // return b - RET - TEXT ·lshVU(SB),NOSPLIT,$0 JMP ·lshVU_g(SB) diff --git a/src/math/big/arith_s390x.s b/src/math/big/arith_s390x.s index 57b263a4c3..a03660be62 100644 --- a/src/math/big/arith_s390x.s +++ b/src/math/big/arith_s390x.s @@ -500,188 +500,6 @@ E1: MOVD R4, c+72(FP) // return c RET -TEXT ·addVW(SB), NOSPLIT, $0 - MOVD z_len+8(FP), R5 // length of z - MOVD x+24(FP), R6 - MOVD y+48(FP), R7 // c = y - MOVD z+0(FP), R8 - - CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return - - // Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag. - ADDC 0(R6), R7 - MOVD R7, 0(R8) - CMPBEQ R5, $1, returnResult // len(z) == 1 - MOVD $0, R9 - ADDE 8(R6), R9 - MOVD R9, 8(R8) - CMPBEQ R5, $2, returnResult // len(z) == 2 - - // Update the counters - MOVD $16, R12 // i = 2 - MOVD $-2(R5), R5 // n = n - 2 - -loopOverEachWord: - BRC $12, copySetup // carry = 0, copy the rest - MOVD $1, R9 - - // Originally we used the carry flag generated in the previous iteration - // (i.e: ADDE could be used here to do the addition). However, since we - // already know carry is 1 (otherwise we will go to copy section), we can use - // ADDC here so the current iteration does not depend on the carry flag - // generated in the previous iteration. This could be useful when branch prediction happens. - ADDC 0(R6)(R12*1), R9 - MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c - - MOVD $8(R12), R12 // i++ - BRCTG R5, loopOverEachWord // n-- - -// Return the current carry value -returnResult: - MOVD $0, R0 - ADDE R0, R0 - MOVD R0, c+56(FP) - RET - -// Update position of x(R6) and z(R8) based on the current counter value and perform copying. -// With the assumption that x and z will not overlap with each other or x and z will -// point to same memory region, we can use a faster version of copy using only MVC here. -// In the following implementation, we have three copy loops, each copying a word, 4 words, and -// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove. -copySetup: - ADD R12, R6 - ADD R12, R8 - - CMPBGE R5, $4, mediumLoop - -smallLoop: // does a loop unrolling to copy word when n < 4 - CMPBEQ R5, $0, returnZero - MVC $8, 0(R6), 0(R8) - CMPBEQ R5, $1, returnZero - MVC $8, 8(R6), 8(R8) - CMPBEQ R5, $2, returnZero - MVC $8, 16(R6), 16(R8) - -returnZero: - MOVD $0, c+56(FP) // return 0 as carry - RET - -mediumLoop: - CMPBLT R5, $4, smallLoop - CMPBLT R5, $32, mediumLoopBody - -largeLoop: // Copying 256 bytes at a time. - MVC $256, 0(R6), 0(R8) - MOVD $256(R6), R6 - MOVD $256(R8), R8 - MOVD $-32(R5), R5 - CMPBGE R5, $32, largeLoop - BR mediumLoop - -mediumLoopBody: // Copying 32 bytes at a time - MVC $32, 0(R6), 0(R8) - MOVD $32(R6), R6 - MOVD $32(R8), R8 - MOVD $-4(R5), R5 - CMPBGE R5, $4, mediumLoopBody - BR smallLoop - -returnC: - MOVD R7, c+56(FP) - RET - -TEXT ·subVW(SB), NOSPLIT, $0 - MOVD z_len+8(FP), R5 - MOVD x+24(FP), R6 - MOVD y+48(FP), R7 // The borrow bit passed in - MOVD z+0(FP), R8 - MOVD $0, R0 // R0 is a temporary variable used during computation. Ensure it has zero in it. - - CMPBEQ R5, $0, returnC // len(z) == 0, have an early return - - // Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag - MOVD 0(R6), R9 - SUBC R7, R9 - MOVD R9, 0(R8) - CMPBEQ R5, $1, returnResult - MOVD 8(R6), R9 - SUBE R0, R9 - MOVD R9, 8(R8) - CMPBEQ R5, $2, returnResult - - // Update the counters - MOVD $16, R12 // i = 2 - MOVD $-2(R5), R5 // n = n - 2 - -loopOverEachWord: - BRC $3, copySetup // no borrow, copy the rest - MOVD 0(R6)(R12*1), R9 - - // Originally we used the borrow flag generated in the previous iteration - // (i.e: SUBE could be used here to do the subtraction). However, since we - // already know borrow is 1 (otherwise we will go to copy section), we can - // use SUBC here so the current iteration does not depend on the borrow flag - // generated in the previous iteration. This could be useful when branch prediction happens. - SUBC $1, R9 - MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1 - - MOVD $8(R12), R12 // i++ - BRCTG R5, loopOverEachWord // n-- - -// return the current borrow value -returnResult: - SUBE R0, R0 - NEG R0, R0 - MOVD R0, c+56(FP) - RET - -// Update position of x(R6) and z(R8) based on the current counter value and perform copying. -// With the assumption that x and z will not overlap with each other or x and z will -// point to same memory region, we can use a faster version of copy using only MVC here. -// In the following implementation, we have three copy loops, each copying a word, 4 words, and -// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove. -copySetup: - ADD R12, R6 - ADD R12, R8 - - CMPBGE R5, $4, mediumLoop - -smallLoop: // does a loop unrolling to copy word when n < 4 - CMPBEQ R5, $0, returnZero - MVC $8, 0(R6), 0(R8) - CMPBEQ R5, $1, returnZero - MVC $8, 8(R6), 8(R8) - CMPBEQ R5, $2, returnZero - MVC $8, 16(R6), 16(R8) - -returnZero: - MOVD $0, c+56(FP) // return 0 as borrow - RET - -mediumLoop: - CMPBLT R5, $4, smallLoop - CMPBLT R5, $32, mediumLoopBody - -largeLoop: // Copying 256 bytes at a time - MVC $256, 0(R6), 0(R8) - MOVD $256(R6), R6 - MOVD $256(R8), R8 - MOVD $-32(R5), R5 - CMPBGE R5, $32, largeLoop - BR mediumLoop - -mediumLoopBody: // Copying 32 bytes at a time - MVC $32, 0(R6), 0(R8) - MOVD $32(R6), R6 - MOVD $32(R8), R8 - MOVD $-4(R5), R5 - CMPBGE R5, $4, mediumLoopBody - BR smallLoop - -returnC: - MOVD R7, c+56(FP) - RET - // func lshVU(z, x []Word, s uint) (c Word) TEXT ·lshVU(SB), NOSPLIT, $0 BR ·lshVU_g(SB) diff --git a/src/math/big/arith_test.go b/src/math/big/arith_test.go index b6e7304a13..bd9f96870b 100644 --- a/src/math/big/arith_test.go +++ b/src/math/big/arith_test.go @@ -28,8 +28,8 @@ var shifts = []uint{1, 2, 3, _W/4 - 1, _W / 4, _W/4 + 1, _W/2 - 1, _W / 2, _W/2 func TestAddVV(t *testing.T) { testVV(t, "addVV", addVV, addVV_g) } func TestSubVV(t *testing.T) { testVV(t, "subVV", subVV, subVV_g) } -func TestAddVW(t *testing.T) { testVW(t, "addVW", addVW, addVW_g, words4) } -func TestSubVW(t *testing.T) { testVW(t, "subVW", subVW, subVW_g, words4) } +func TestAddVW(t *testing.T) { testVW(t, "addVW", addVW, addVW_ref, words4) } +func TestSubVW(t *testing.T) { testVW(t, "subVW", subVW, subVW_ref, words4) } func TestLshVU(t *testing.T) { testVU(t, "lshVU", lshVU, lshVU_g, shifts) } func TestRshVU(t *testing.T) { testVU(t, "rshVU", rshVU, rshVU_g, shifts) } func TestMulAddVWW(t *testing.T) { testVWW(t, "mulAddVWW", mulAddVWW, mulAddVWW_g, muls) } @@ -865,21 +865,15 @@ func benchVV(fn func(z, x, y []Word) Word) benchFunc { } func BenchmarkAddVW(b *testing.B) { - bench(b, "/impl=asm/data=random", benchVW(addVW, 123)) - bench(b, "/impl=asm/data=carry", benchCarryVW(addVW, ^Word(0), 1)) - bench(b, "/impl=asm/data=shortcut", benchShortVW(addVW, 123)) - bench(b, "/impl=go/data=random", benchVW(addVW_g, 123)) - bench(b, "/impl=go/data=carry", benchCarryVW(addVW_g, ^Word(0), 1)) - bench(b, "/impl=go/data=shortcut", benchShortVW(addVW_g, 123)) + bench(b, "/data=random", benchVW(addVW, 123)) + bench(b, "/data=carry", benchCarryVW(addVW, ^Word(0), 1)) + bench(b, "/data=shortcut", benchShortVW(addVW, 123)) } func BenchmarkSubVW(b *testing.B) { - bench(b, "/impl=asm/data=random", benchVW(subVW, 123)) - bench(b, "/impl=asm/data=carry", benchCarryVW(subVW, 0, 1)) - bench(b, "/impl=asm/data=shortcut", benchShortVW(subVW, 123)) - bench(b, "/impl=go/data=random", benchVW(subVW_g, 123)) - bench(b, "/impl=go/data=carry", benchCarryVW(subVW_g, 0, 1)) - bench(b, "/impl=go/data=shortcut", benchShortVW(subVW_g, 123)) + bench(b, "/data=random", benchVW(subVW, 123)) + bench(b, "/data=carry", benchCarryVW(subVW, 0, 1)) + bench(b, "/data=shortcut", benchShortVW(subVW, 123)) } func benchVW(fn func(z, x []Word, w Word) Word, w Word) benchFunc { diff --git a/src/math/big/arith_wasm.s b/src/math/big/arith_wasm.s index 8aadeaa28d..3a9aa4ddcb 100644 --- a/src/math/big/arith_wasm.s +++ b/src/math/big/arith_wasm.s @@ -12,12 +12,6 @@ TEXT ·addVV(SB),NOSPLIT,$0 TEXT ·subVV(SB),NOSPLIT,$0 JMP ·subVV_g(SB) -TEXT ·addVW(SB),NOSPLIT,$0 - JMP ·addVW_g(SB) - -TEXT ·subVW(SB),NOSPLIT,$0 - JMP ·subVW_g(SB) - TEXT ·lshVU(SB),NOSPLIT,$0 JMP ·lshVU_g(SB)