math/big: replace addMulVVW with addMulVVWW

addMulVVW is an unnecessarily special case.
All other assembly routines taking []Word (V as in vector) arguments
take separate source and destination. For example:

	addVV: z = x+y
	mulAddVWW: z = x*m+a

addMulVVW uses the z parameter as both destination and source:

	addMulVVW: z = z+x*m

Even looking at the signatures is confusing: all the VV routines take
two input vectors x and y, but addMulVVW takes only x: where is y?
(The answer is that the two inputs are z and x.)

It would be nice to fix this, both for understandability and regularity,
and to simplify a future assembly generator.

We cannot remove or redefine addMulVVW, because it has been used
in linknames. Instead, the CL adds a new final addend argument ‘a’
like in mulAddVWW, making the natural name addMulVVWW
(two input vectors, two input words):

	addMulVVWW: z = x+y*m+a

This CL updates all the assembly implementations to rename the
inputs z, x, y -> x, y, m, and then introduces a separate destination z.

Change-Id: Ib76c80b53f6d1f4a901f663566e9c4764bb20488
Reviewed-on: https://go-review.googlesource.com/c/go/+/664895
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Alan Donovan <adonovan@google.com>
This commit is contained in:
Russ Cox 2025-04-05 14:29:00 -04:00
parent 037112464b
commit 4dffdd797b
17 changed files with 226 additions and 199 deletions

View File

@ -194,10 +194,11 @@ func mulAddVWW_g(z, x []Word, y, r Word) (c Word) {
return
}
func addMulVVW_g(z, x []Word, y Word) (c Word) {
func addMulVVWW_g(z, x, y []Word, m, a Word) (c Word) {
c = a
// The comment near the top of this file discusses this for loop condition.
for i := 0; i < len(z) && i < len(x); i++ {
z1, z0 := mulAddWWW_g(x[i], y, z[i])
for i := 0; i < len(z) && i < len(x) && i < len(y); i++ {
z1, z0 := mulAddWWW_g(y[i], m, x[i])
lo, cc := bits.Add(uint(z0), uint(c), 0)
c, z[i] = Word(cc), Word(lo)
c += z1

View File

@ -177,12 +177,12 @@ X9b: MOVL $0, c+28(FP)
RET
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL y+24(FP), BP
MOVL r+28(FP), CX // c = r
MOVL m+24(FP), BP
MOVL a+28(FP), CX // c = a
MOVL z_len+4(FP), BX
LEAL (DI)(BX*4), DI
LEAL (SI)(BX*4), SI
@ -204,23 +204,25 @@ E5: CMPL BX, $0 // i < 0
RET
// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL y+24(FP), BP
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
MOVL z+0(FP), BP
MOVL x+12(FP), DI
MOVL y+24(FP), SI
MOVL a+40(FP), CX
MOVL z_len+4(FP), BX
LEAL (DI)(BX*4), DI
LEAL (SI)(BX*4), SI
LEAL (BP)(BX*4), BP
NEGL BX // i = -n
MOVL $0, CX // c = 0
JMP E6
L6: MOVL (SI)(BX*4), AX
MULL BP
MULL m+36(FP)
ADDL CX, AX
ADCL $0, DX
ADDL AX, (DI)(BX*4)
ADDL (DI)(BX*4), AX
MOVL AX, (BP)(BX*4)
ADCL $0, DX
MOVL DX, CX
ADDL $1, BX // i++
@ -228,7 +230,7 @@ L6: MOVL (SI)(BX*4), AX
E6: CMPL BX, $0 // i < 0
JL L6
MOVL CX, c+28(FP)
MOVL CX, c+44(FP)
RET

View File

@ -306,12 +306,12 @@ X9b: MOVQ $0, c+56(FP)
RET
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOVQ z+0(FP), R10
MOVQ x+24(FP), R8
MOVQ y+48(FP), R9
MOVQ r+56(FP), CX // c = r
MOVQ m+48(FP), R9
MOVQ a+56(FP), CX // c = a
MOVQ z_len+8(FP), R11
MOVQ $0, BX // i = 0
@ -366,16 +366,17 @@ E5: CMPQ BX, R11 // i < n
RET
// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB),NOSPLIT,$0
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
CMPB ·support_adx(SB), $1
JEQ adx
MOVQ z+0(FP), R10
MOVQ x+24(FP), R8
MOVQ y+48(FP), R9
MOVQ z+0(FP), R14
MOVQ x+24(FP), R10
MOVQ y+48(FP), R8
MOVQ m+72(FP), R9
MOVQ z_len+8(FP), R11
MOVQ $0, BX // i = 0
MOVQ $0, CX // c = 0
MOVQ a+80(FP), CX // c = a
MOVQ R11, R12
ANDQ $-2, R12
CMPQ R11, $2
@ -390,7 +391,7 @@ A6:
ADDQ CX, AX
ADCQ $0, DX
MOVQ DX, CX
MOVQ AX, (R10)(BX*8)
MOVQ AX, (R14)(BX*8)
MOVQ (8)(R8)(BX*8), AX
MULQ R9
@ -399,7 +400,7 @@ A6:
ADDQ CX, AX
ADCQ $0, DX
MOVQ DX, CX
MOVQ AX, (8)(R10)(BX*8)
MOVQ AX, (8)(R14)(BX*8)
ADDQ $2, BX
CMPQ BX, R12
@ -410,7 +411,8 @@ L6: MOVQ (R8)(BX*8), AX
MULQ R9
ADDQ CX, AX
ADCQ $0, DX
ADDQ AX, (R10)(BX*8)
ADDQ (R10)(BX*8), AX
MOVQ AX, (R14)(BX*8)
ADCQ $0, DX
MOVQ DX, CX
ADDQ $1, BX // i++
@ -418,21 +420,22 @@ L6: MOVQ (R8)(BX*8), AX
E6: CMPQ BX, R11 // i < n
JL L6
MOVQ CX, c+56(FP)
MOVQ CX, c+88(FP)
RET
adx:
MOVQ z_len+8(FP), R11
MOVQ z+0(FP), R10
MOVQ x+24(FP), R8
MOVQ y+48(FP), DX
MOVQ z+0(FP), R14
MOVQ x+24(FP), R10
MOVQ y+48(FP), R8
MOVQ m+72(FP), DX
MOVQ $0, BX // i = 0
MOVQ $0, CX // carry
CMPQ R11, $8
JAE adx_loop_header
CMPQ BX, R11
JL adx_short
MOVQ CX, c+56(FP)
MOVQ CX, c+88(FP)
RET
adx_loop_header:
@ -448,52 +451,54 @@ adx_loop:
MULXQ 8(R8), AX, CX
ADCXQ DI, AX
ADOXQ 8(R10), AX
MOVQ AX, 8(R10)
MOVQ AX, 8(R14)
MULXQ 16(R8), SI, DI
ADCXQ CX, SI
ADOXQ 16(R10), SI
MOVQ SI, 16(R10)
MOVQ SI, 16(R14)
MULXQ 24(R8), AX, CX
ADCXQ DI, AX
ADOXQ 24(R10), AX
MOVQ AX, 24(R10)
MOVQ AX, 24(R14)
MULXQ 32(R8), SI, DI
ADCXQ CX, SI
ADOXQ 32(R10), SI
MOVQ SI, 32(R10)
MOVQ SI, 32(R14)
MULXQ 40(R8), AX, CX
ADCXQ DI, AX
ADOXQ 40(R10), AX
MOVQ AX, 40(R10)
MOVQ AX, 40(R14)
MULXQ 48(R8), SI, DI
ADCXQ CX, SI
ADOXQ 48(R10), SI
MOVQ SI, 48(R10)
MOVQ SI, 48(R14)
MULXQ 56(R8), AX, CX
ADCXQ DI, AX
ADOXQ 56(R10), AX
MOVQ AX, 56(R10)
MOVQ AX, 56(R14)
ADCXQ R9, CX
ADOXQ R9, CX
ADDQ $64, R8
ADDQ $64, R10
ADDQ $64, R14
ADDQ $8, BX
CMPQ BX, R13
JL adx_loop
MOVQ z+0(FP), R10
MOVQ x+24(FP), R8
MOVQ z+0(FP), R14
MOVQ x+24(FP), R10
MOVQ y+48(FP), R8
CMPQ BX, R11
JL adx_short
MOVQ CX, c+56(FP)
MOVQ CX, c+88(FP)
RET
adx_short:
@ -508,7 +513,7 @@ adx_short:
CMPQ BX, R11
JL adx_short
MOVQ CX, c+56(FP)
MOVQ CX, c+88(FP)
RET

View File

@ -215,14 +215,14 @@ X6:
RET
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOVW $0, R0
MOVW z+0(FP), R1
MOVW z_len+4(FP), R5
MOVW x+12(FP), R2
MOVW y+24(FP), R3
MOVW r+28(FP), R4
MOVW m+24(FP), R3
MOVW a+28(FP), R4
ADD R5<<2, R1, R5
B E8
@ -242,15 +242,16 @@ E8:
RET
// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB),NOSPLIT,$0
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
MOVW $0, R0
MOVW z+0(FP), R1
MOVW z+0(FP), R9
MOVW x+12(FP), R1
MOVW z_len+4(FP), R5
MOVW x+12(FP), R2
MOVW y+24(FP), R3
MOVW y+24(FP), R2
MOVW m+36(FP), R3
ADD R5<<2, R1, R5
MOVW $0, R4
MOVW a+40(FP), R4
B E9
// word loop
@ -259,14 +260,14 @@ L9:
MULLU R6, R3, (R7, R6)
ADD.S R4, R6
ADC R0, R7
MOVW 0(R1), R4
MOVW.P 4(R1), R4
ADD.S R4, R6
ADC R0, R7
MOVW.P R6, 4(R1)
MOVW.P R6, 4(R9)
MOVW R7, R4
E9:
TEQ R1, R5
BNE L9
MOVW R4, c+28(FP)
MOVW R4, c+44(FP)
RET

View File

@ -425,13 +425,13 @@ len0:
RET
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOVD z+0(FP), R1
MOVD z_len+8(FP), R0
MOVD x+24(FP), R2
MOVD y+48(FP), R3
MOVD r+56(FP), R4
MOVD m+48(FP), R3
MOVD a+56(FP), R4
// c, z = x * m + a
TBZ $0, R0, two
MOVD.P 8(R2), R5
@ -483,33 +483,36 @@ done:
RET
// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB),NOSPLIT,$0
MOVD z+0(FP), R1
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
MOVD z+0(FP), R22
MOVD x+24(FP), R1
MOVD z_len+8(FP), R0
MOVD x+24(FP), R2
MOVD y+48(FP), R3
MOVD $0, R4
MOVD y+48(FP), R2
MOVD m+72(FP), R3
MOVD a+80(FP), R4
TBZ $0, R0, two
MOVD.P 8(R2), R5
MOVD (R1), R6
MOVD.P 8(R1), R6
MUL R5, R3, R7
UMULH R5, R3, R8
ADDS R4, R7
ADC $0, R8
ADDS R7, R6
ADC $0, R8, R4
MOVD.P R6, 8(R1)
MOVD.P R6, 8(R22)
SUB $1, R0
two:
TBZ $1, R0, loop
LDP.P 16(R2), (R5, R10)
LDP (R1), (R6, R11)
LDP.P 16(R1), (R6, R11)
MUL R10, R3, R13
UMULH R10, R3, R12
@ -525,7 +528,7 @@ two:
ADCS R8, R11
ADC $0, R12, R4
STP.P (R6, R11), 16(R1)
STP.P (R6, R11), 16(R22)
SUB $2, R0
// The main loop of this code operates on a block of 4 words every iteration
@ -538,12 +541,12 @@ loop:
LDP.P 16(R2), (R5, R6)
LDP.P 16(R2), (R7, R8)
LDP (R1), (R9, R10)
LDP.P 16(R1), (R9, R10)
ADDS R4, R9
MUL R6, R3, R14
ADCS R14, R10
MUL R7, R3, R15
LDP 16(R1), (R11, R12)
LDP.P 16(R1), (R11, R12)
ADCS R15, R11
MUL R8, R3, R16
ADCS R16, R12
@ -555,18 +558,18 @@ loop:
UMULH R5, R3, R17
ADCS R17, R10
UMULH R6, R3, R21
STP.P (R9, R10), 16(R1)
STP.P (R9, R10), 16(R22)
ADCS R21, R11
UMULH R7, R3, R19
ADCS R19, R12
STP.P (R11, R12), 16(R1)
STP.P (R11, R12), 16(R22)
ADC $0, R20, R4
SUB $4, R0
B loop
done:
MOVD R4, c+56(FP)
MOVD R4, c+88(FP)
RET

View File

@ -83,9 +83,9 @@ func shrVU(z, x []Word, s uint) (c Word)
//
//go:linkname mulAddVWW
//go:noescape
func mulAddVWW(z, x []Word, y, r Word) (c Word)
func mulAddVWW(z, x []Word, m, a Word) (c Word)
// addMulVVW should be an internal detail,
// addMulVVW should be an internal detail (and a stale one at that),
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:
// - github.com/remyoudompheng/bigfft
@ -94,5 +94,11 @@ func mulAddVWW(z, x []Word, y, r Word) (c Word)
// See go.dev/issue/67401.
//
//go:linkname addMulVVW
func addMulVVW(z, x []Word, y Word) (c Word) {
// Legacy in-place form kept for linkname users: z is passed as both the
// destination and the first input vector, the old x and y become the
// vector and word that are multiplied, and the extra addend is 0.
return addMulVVWW(z, z, x, y, 0)
}
// addMulVVWW sets z = x+y*m+a.
//
//go:noescape
func addMulVVW(z, x []Word, y Word) (c Word)
func addMulVVWW(z, x, y []Word, m, a Word) (c Word)

View File

@ -44,6 +44,6 @@ func mulAddVWW(z, x []Word, y, r Word) (c Word) {
return mulAddVWW_g(z, x, y, r)
}
func addMulVVW(z, x []Word, y Word) (c Word) {
return addMulVVW_g(z, x, y)
func addMulVVWW(z, x, y []Word, m, a Word) (c Word) {
return addMulVVWW_g(z, x, y, m, a)
}

View File

@ -30,5 +30,5 @@ TEXT ·shrVU(SB),NOSPLIT,$0
TEXT ·mulAddVWW(SB),NOSPLIT,$0
JMP ·mulAddVWW_g(SB)
TEXT ·addMulVVW(SB),NOSPLIT,$0
JMP ·addMulVVW_g(SB)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
JMP ·addMulVVWW_g(SB)

View File

@ -30,6 +30,6 @@ TEXT ·shrVU(SB),NOSPLIT,$0
TEXT ·mulAddVWW(SB),NOSPLIT,$0
JMP ·mulAddVWW_g(SB)
TEXT ·addMulVVW(SB),NOSPLIT,$0
JMP ·addMulVVW_g(SB)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
JMP ·addMulVVWW_g(SB)

View File

@ -30,6 +30,6 @@ TEXT ·shrVU(SB),NOSPLIT,$0
TEXT ·mulAddVWW(SB),NOSPLIT,$0
JMP ·mulAddVWW_g(SB)
TEXT ·addMulVVW(SB),NOSPLIT,$0
JMP ·addMulVVW_g(SB)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
JMP ·addMulVVWW_g(SB)

View File

@ -391,7 +391,7 @@ zeroshift:
CMPU R11, R7, CR2 // < len?
BLT CR2, backward // there is overlap, copy backwards
MOVD $0, R14
// shlVU processes backwards, but added a forward copy option
// shlVU processes backwards, but added a forward copy option
// since it's faster on POWER
repeat:
MOVD (R6)(R14), R15 // Copy 8 bytes at a time
@ -458,7 +458,7 @@ loopback:
BLE loopback
CMP R8, R4 // Are we at the last element?
BEQ loopexit
scalar:
scalar:
ADD $-1, R8, R10
SLD $3, R10
MOVD (R6)(R10),R11
@ -496,12 +496,12 @@ done:
MOVD R0, c+56(FP)
RET
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R10 // R10 = z[]
MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R9 // R9 = y
MOVD r+56(FP), R4 // R4 = r = c
MOVD m+48(FP), R9 // R9 = m
MOVD a+56(FP), R4 // R4 = a = c
MOVD z_len+8(FP), R11 // R11 = z_len
CMP R11, $0
@ -587,59 +587,61 @@ done:
MOVD R4, c+64(FP)
RET
// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB), NOSPLIT, $0
MOVD z+0(FP), R3 // R3 = z[]
MOVD x+24(FP), R4 // R4 = x[]
MOVD y+48(FP), R5 // R5 = y
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R22 // R22 = z[]
MOVD x+24(FP), R3 // R3 = x[]
MOVD y+48(FP), R4 // R4 = y[]
MOVD m+72(FP), R5 // R5 = m
MOVD z_len+8(FP), R6 // R6 = z_len
CMP R6, $4
MOVD R0, R9 // R9 = c = 0
MOVD a+80(FP), R9 // R9 = c = a
BLT tail
SRD $2, R6, R7
MOVD R7, CTR // Initialize loop counter
PCALIGN $16
loop:
MOVD 0(R4), R14 // x[i]
MOVD 8(R4), R16 // x[i+1]
MOVD 16(R4), R18 // x[i+2]
MOVD 24(R4), R20 // x[i+3]
MOVD 0(R3), R15 // z[i]
MOVD 8(R3), R17 // z[i+1]
MOVD 16(R3), R19 // z[i+2]
MOVD 24(R3), R21 // z[i+3]
MULLD R5, R14, R10 // low x[i]*y
MULHDU R5, R14, R11 // high x[i]*y
MOVD 0(R4), R14 // y[i]
MOVD 8(R4), R16 // y[i+1]
MOVD 16(R4), R18 // y[i+2]
MOVD 24(R4), R20 // y[i+3]
MOVD 0(R3), R15 // x[i]
MOVD 8(R3), R17 // x[i+1]
MOVD 16(R3), R19 // x[i+2]
MOVD 24(R3), R21 // x[i+3]
MULLD R5, R14, R10 // low y[i]*m
MULHDU R5, R14, R11 // high y[i]*m
ADDC R15, R10
ADDZE R11
ADDC R9, R10
ADDZE R11, R9
MULLD R5, R16, R14 // low x[i+1]*y
MULHDU R5, R16, R15 // high x[i+1]*y
MULLD R5, R16, R14 // low y[i+1]*m
MULHDU R5, R16, R15 // high y[i+1]*m
ADDC R17, R14
ADDZE R15
ADDC R9, R14
ADDZE R15, R9
MULLD R5, R18, R16 // low x[i+2]*y
MULHDU R5, R18, R17 // high x[i+2]*y
MULLD R5, R18, R16 // low y[i+2]*m
MULHDU R5, R18, R17 // high y[i+2]*m
ADDC R19, R16
ADDZE R17
ADDC R9, R16
ADDZE R17, R9
MULLD R5, R20, R18 // low x[i+3]*y
MULHDU R5, R20, R19 // high x[i+3]*y
MULLD R5, R20, R18 // low y[i+3]*m
MULHDU R5, R20, R19 // high y[i+3]*m
ADDC R21, R18
ADDZE R19
ADDC R9, R18
ADDZE R19, R9
MOVD R10, 0(R3) // z[i]
MOVD R14, 8(R3) // z[i+1]
MOVD R16, 16(R3) // z[i+2]
MOVD R18, 24(R3) // z[i+3]
MOVD R10, 0(R22) // z[i]
MOVD R14, 8(R22) // z[i+1]
MOVD R16, 16(R22) // z[i+2]
MOVD R18, 24(R22) // z[i+3]
ADD $32, R3
ADD $32, R4
ADD $32, R22
BDNZ loop
ANDCC $3, R6
@ -657,12 +659,13 @@ tailloop:
ADDZE R11
ADDC R9, R10
ADDZE R11, R9
MOVD R10, 0(R3)
MOVD R10, 0(R22)
ADD $8, R3
ADD $8, R4
ADD $8, R22
BDNZ tailloop
done:
MOVD R9, c+56(FP)
MOVD R9, c+88(FP)
RET

View File

@ -301,10 +301,10 @@ TEXT ·shrVU(SB),NOSPLIT,$0
TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOV x+24(FP), X5
MOV y+48(FP), X6
MOV m+48(FP), X6
MOV z+0(FP), X7
MOV z_len+8(FP), X30
MOV r+56(FP), X29
MOV a+56(FP), X29
MOV $4, X28
@ -317,26 +317,26 @@ loop4:
MOV 16(X5), X14 // x[2]
MOV 24(X5), X17 // x[3]
MULHU X8, X6, X9 // z_hi[0] = x[0] * y
MUL X8, X6, X8 // z_lo[0] = x[0] * y
MULHU X8, X6, X9 // z_hi[0] = x[0] * m
MUL X8, X6, X8 // z_lo[0] = x[0] * m
ADD X8, X29, X10 // z[0] = z_lo[0] + c
SLTU X8, X10, X23
ADD X23, X9, X29 // next c
MULHU X11, X6, X12 // z_hi[1] = x[1] * y
MUL X11, X6, X11 // z_lo[1] = x[1] * y
MULHU X11, X6, X12 // z_hi[1] = x[1] * m
MUL X11, X6, X11 // z_lo[1] = x[1] * m
ADD X11, X29, X13 // z[1] = z_lo[1] + c
SLTU X11, X13, X23
ADD X23, X12, X29 // next c
MULHU X14, X6, X15 // z_hi[2] = x[2] * y
MUL X14, X6, X14 // z_lo[2] = x[2] * y
MULHU X14, X6, X15 // z_hi[2] = x[2] * m
MUL X14, X6, X14 // z_lo[2] = x[2] * m
ADD X14, X29, X16 // z[2] = z_lo[2] + c
SLTU X14, X16, X23
ADD X23, X15, X29 // next c
MULHU X17, X6, X18 // z_hi[3] = x[3] * y
MUL X17, X6, X17 // z_lo[3] = x[3] * y
MULHU X17, X6, X18 // z_hi[3] = x[3] * m
MUL X17, X6, X17 // z_lo[3] = x[3] * m
ADD X17, X29, X19 // z[3] = z_lo[3] + c
SLTU X17, X19, X23
ADD X23, X18, X29 // next c
@ -356,8 +356,8 @@ loop4:
loop1:
MOV 0(X5), X10 // x
MULHU X10, X6, X12 // z_hi = x * y
MUL X10, X6, X10 // z_lo = x * y
MULHU X10, X6, X12 // z_hi = x * m
MUL X10, X6, X10 // z_lo = x * m
ADD X10, X29, X13 // z_lo + c
SLTU X10, X13, X15
ADD X12, X15, X29 // next c
@ -374,97 +374,100 @@ done:
MOV X29, c+64(FP) // return c
RET
TEXT ·addMulVVW(SB),NOSPLIT,$0
MOV x+24(FP), X5
MOV y+48(FP), X6
MOV z+0(FP), X7
TEXT ·addMulVVWW(SB),NOSPLIT,$0
MOV y+48(FP), X5
MOV m+72(FP), X6
MOV x+24(FP), X7
MOV z+0(FP), X20
MOV z_len+8(FP), X30
MOV $4, X28
MOV $0, X29 // c = 0
MOV a+80(FP), X29 // c = a
BEQZ X30, done
BLTU X30, X28, loop1
loop4:
MOV 0(X5), X8 // x[0]
MOV 0(X7), X10 // z[0]
MOV 8(X5), X11 // x[1]
MOV 8(X7), X13 // z[1]
MOV 16(X5), X14 // x[2]
MOV 16(X7), X16 // z[2]
MOV 24(X5), X17 // x[3]
MOV 24(X7), X19 // z[3]
MOV 0(X5), X8 // y[0]
MOV 0(X7), X10 // x[0]
MOV 8(X5), X11 // y[1]
MOV 8(X7), X13 // x[1]
MOV 16(X5), X14 // y[2]
MOV 16(X7), X16 // x[2]
MOV 24(X5), X17 // y[3]
MOV 24(X7), X19 // x[3]
MULHU X8, X6, X9 // z_hi[0] = x[0] * y
MUL X8, X6, X8 // z_lo[0] = x[0] * y
ADD X8, X10, X21 // z_lo[0] = x[0] * y + z[0]
MULHU X8, X6, X9 // x_hi[0] = y[0] * m
MUL X8, X6, X8 // x_lo[0] = y[0] * m
ADD X8, X10, X21 // x_lo[0] = y[0] * m + x[0]
SLTU X8, X21, X22
ADD X9, X22, X9 // z_hi[0] = x[0] * y + z[0]
ADD X21, X29, X10 // z[0] = x[0] * y + z[0] + c
ADD X9, X22, X9 // x_hi[0] = y[0] * m + x[0]
ADD X21, X29, X10 // x[0] = y[0] * m + x[0] + c
SLTU X21, X10, X22
ADD X9, X22, X29 // next c
MULHU X11, X6, X12 // z_hi[1] = x[1] * y
MUL X11, X6, X11 // z_lo[1] = x[1] * y
ADD X11, X13, X21 // z_lo[1] = x[1] * y + z[1]
MULHU X11, X6, X12 // x_hi[1] = y[1] * m
MUL X11, X6, X11 // x_lo[1] = y[1] * m
ADD X11, X13, X21 // x_lo[1] = y[1] * m + x[1]
SLTU X11, X21, X22
ADD X12, X22, X12 // z_hi[1] = x[1] * y + z[1]
ADD X21, X29, X13 // z[1] = x[1] * y + z[1] + c
ADD X12, X22, X12 // x_hi[1] = y[1] * m + x[1]
ADD X21, X29, X13 // x[1] = y[1] * m + x[1] + c
SLTU X21, X13, X22
ADD X12, X22, X29 // next c
MULHU X14, X6, X15 // z_hi[2] = x[2] * y
MUL X14, X6, X14 // z_lo[2] = x[2] * y
ADD X14, X16, X21 // z_lo[2] = x[2] * y + z[2]
MULHU X14, X6, X15 // x_hi[2] = y[2] * m
MUL X14, X6, X14 // x_lo[2] = y[2] * m
ADD X14, X16, X21 // x_lo[2] = y[2] * m + x[2]
SLTU X14, X21, X22
ADD X15, X22, X15 // z_hi[2] = x[2] * y + z[2]
ADD X21, X29, X16 // z[2] = x[2] * y + z[2] + c
ADD X15, X22, X15 // x_hi[2] = y[2] * m + x[2]
ADD X21, X29, X16 // x[2] = y[2] * m + x[2] + c
SLTU X21, X16, X22
ADD X15, X22, X29 // next c
MULHU X17, X6, X18 // z_hi[3] = x[3] * y
MUL X17, X6, X17 // z_lo[3] = x[3] * y
ADD X17, X19, X21 // z_lo[3] = x[3] * y + z[3]
MULHU X17, X6, X18 // x_hi[3] = y[3] * m
MUL X17, X6, X17 // x_lo[3] = y[3] * m
ADD X17, X19, X21 // x_lo[3] = y[3] * m + x[3]
SLTU X17, X21, X22
ADD X18, X22, X18 // z_hi[3] = x[3] * y + z[3]
ADD X21, X29, X19 // z[3] = x[3] * y + z[3] + c
ADD X18, X22, X18 // x_hi[3] = y[3] * m + x[3]
ADD X21, X29, X19 // x[3] = y[3] * m + x[3] + c
SLTU X21, X19, X22
ADD X18, X22, X29 // next c
MOV X10, 0(X7) // z[0]
MOV X13, 8(X7) // z[1]
MOV X16, 16(X7) // z[2]
MOV X19, 24(X7) // z[3]
MOV X10, 0(X20) // z[0]
MOV X13, 8(X20) // z[1]
MOV X16, 16(X20) // z[2]
MOV X19, 24(X20) // z[3]
ADD $32, X5
ADD $32, X7
ADD $32, X20
SUB $4, X30
BGEU X30, X28, loop4
BEQZ X30, done
loop1:
MOV 0(X5), X10 // x
MOV 0(X7), X11 // z
MOV 0(X5), X10 // y
MOV 0(X7), X11 // x
MULHU X10, X6, X12 // z_hi = x * y
MUL X10, X6, X10 // z_lo = x * y
ADD X10, X11, X13 // z_lo = x * y + z
MULHU X10, X6, X12 // z_hi = y * m
MUL X10, X6, X10 // z_lo = y * m
ADD X10, X11, X13 // z_lo = y * m + x
SLTU X10, X13, X15
ADD X12, X15, X12 // z_hi = x * y + z
ADD X13, X29, X10 // z = x * y + z + c
ADD X12, X15, X12 // z_hi = y * m + x
ADD X13, X29, X10 // z = y * m + x + c
SLTU X13, X10, X15
ADD X12, X15, X29 // next c
MOV X10, 0(X7) // z
MOV X10, 0(X20) // z
ADD $8, X5
ADD $8, X7
ADD $8, X20
SUB $1, X30
BNEZ X30, loop1
done:
MOV X29, c+56(FP) // return c
MOV X29, c+88(FP) // return c
RET

View File

@ -691,12 +691,12 @@ TEXT ·shrVU(SB), NOSPLIT, $0
BR ·shrVU_g(SB)
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, DX = r3, AX = r6, BX = R1, (R0 set to 0) + use R11 + use R7 for i
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R2
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD r+56(FP), R4 // c = r
MOVD m+48(FP), R9
MOVD a+56(FP), R4 // c = a
MOVD z_len+8(FP), R5
MOVD $0, R1 // i = 0
MOVD $0, R7 // i*8 = 0
@ -719,18 +719,19 @@ E5:
MOVD R4, c+64(FP)
RET
// func addMulVVW(z, x []Word, y Word) (c Word)
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1, (R0 set to 0) + use R11 + use R7 for i
TEXT ·addMulVVW(SB), NOSPLIT, $0
MOVD z+0(FP), R2
MOVD x+24(FP), R8
MOVD y+48(FP), R9
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R3
MOVD x+24(FP), R2
MOVD y+48(FP), R8
MOVD m+72(FP), R9
MOVD z_len+8(FP), R5
MOVD $0, R1 // i*8 = 0
MOVD $0, R7 // i = 0
MOVD $0, R0 // make sure it's zero
MOVD $0, R4 // c = 0
MOVD a+80(FP), R4 // c = a
MOVD R5, R12
AND $-2, R12
@ -746,7 +747,7 @@ A6:
ADDC R4, R11
ADDE R0, R6
MOVD R6, R4
MOVD R11, (R2)(R1*1)
MOVD R11, (R3)(R1*1)
MOVD (8)(R8)(R1*1), R6
MULHDU R9, R6
@ -756,7 +757,7 @@ A6:
ADDC R4, R11
ADDE R0, R6
MOVD R6, R4
MOVD R11, (8)(R2)(R1*1)
MOVD R11, (8)(R3)(R1*1)
ADD $16, R1 // i*8 + 8
ADD $2, R7 // i++
@ -773,7 +774,7 @@ L6:
ADDC R4, R11
ADDE R0, R6
MOVD R6, R4
MOVD R11, (R2)(R1*1)
MOVD R11, (R3)(R1*1)
ADD $8, R1 // i*8 + 8
ADD $1, R7 // i++
@ -781,6 +782,6 @@ L6:
E6:
CMPBLT R7, R5, L6 // i < n
MOVD R4, c+56(FP)
MOVD R4, c+88(FP)
RET

View File

@ -629,7 +629,7 @@ func BenchmarkMulAddVWW(b *testing.B) {
if isRaceBuilder && n > 1e3 {
continue
}
z := make([]Word, n+1)
z := make([]Word, n)
x := rndV(n)
y := rndW()
r := rndW()
@ -642,18 +642,20 @@ func BenchmarkMulAddVWW(b *testing.B) {
}
}
func BenchmarkAddMulVVW(b *testing.B) {
func BenchmarkAddMulVVWW(b *testing.B) {
for _, n := range benchSizes {
if isRaceBuilder && n > 1e3 {
continue
}
x := rndV(n)
y := rndW()
z := make([]Word, n)
x := rndV(n)
y := rndV(n)
m := rndW()
a := rndW()
b.Run(fmt.Sprint(n), func(b *testing.B) {
b.SetBytes(int64(n * _W))
for i := 0; i < b.N; i++ {
addMulVVW(z, x, y)
addMulVVWW(z, x, y, m, a)
}
})
}

View File

@ -27,6 +27,6 @@ TEXT ·shrVU(SB),NOSPLIT,$0
TEXT ·mulAddVWW(SB),NOSPLIT,$0
JMP ·mulAddVWW_g(SB)
TEXT ·addMulVVW(SB),NOSPLIT,$0
JMP ·addMulVVW_g(SB)
TEXT ·addMulVVWW(SB),NOSPLIT,$0
JMP ·addMulVVWW_g(SB)

View File

@ -197,9 +197,9 @@ func (z nat) montgomery(x, y, m nat, k Word, n int) nat {
var c Word
for i := 0; i < n; i++ {
d := y[i]
c2 := addMulVVW(z[i:n+i], x, d)
c2 := addMulVVWW(z[i:n+i], z[i:n+i], x, d, 0)
t := z[i] * k
c3 := addMulVVW(z[i:n+i], m, t)
c3 := addMulVVWW(z[i:n+i], z[i:n+i], m, t, 0)
cx := c + c2
cy := cx + c3
z[n+i] = cy

View File

@ -126,7 +126,7 @@ func basicSqr(stk *stack, z, x nat) {
// z collects the squares x[i] * x[i]
z[2*i+1], z[2*i] = mulWW(d, d)
// t collects the products x[i] * x[j] where j < i
t[2*i] = addMulVVW(t[i:2*i], x[0:i], d)
t[2*i] = addMulVVWW(t[i:2*i], t[i:2*i], x[0:i], d, 0)
}
t[2*n-1] = shlVU(t[1:2*n-1], t[1:2*n-1], 1) // double the j < i products
addVV(z, z, t) // combine the result
@ -152,7 +152,7 @@ func basicMul(z, x, y nat) {
clear(z[0 : len(x)+len(y)]) // initialize z
for i, d := range y {
if d != 0 {
z[len(x)+i] = addMulVVW(z[i:i+len(x)], x, d)
z[len(x)+i] = addMulVVWW(z[i:i+len(x)], z[i:i+len(x)], x, d, 0)
}
}
}