mirror of
https://github.com/golang/go.git
synced 2025-05-05 15:43:04 +00:00
math/big: replace addMulVVW with addMulVVWW
addMulVVW is an unnecessarily special case. All other assembly routines taking []Word (V as in vector) arguments take separate source and destination. For example: addVV: z = x+y mulAddVWW: z = x*m+a addMulVVW uses the z parameter as both destination and source: addMulVVW: z = z+x*m Even looking at the signatures is confusing: all the VV routines take two input vectors x and y, but addMulVVW takes only x: where is y? (The answer is that the two inputs are z and x.) It would be nice to fix this, both for understandability and regularity, and to simplify a future assembly generator. We cannot remove or redefine addMulVVW, because it has been used in linknames. Instead, the CL adds a new final addend argument ‘a’ like in mulAddVWW, making the natural name addMulVVWW (two input vectors, two input words): addMulVVWW: z = x+y*m+a This CL updates all the assembly implementations to rename the inputs z, x, y -> x, y, m, and then introduces a separate destination z. Change-Id: Ib76c80b53f6d1f4a901f663566e9c4764bb20488 Reviewed-on: https://go-review.googlesource.com/c/go/+/664895 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Alan Donovan <adonovan@google.com>
This commit is contained in:
parent
037112464b
commit
4dffdd797b
@ -194,10 +194,11 @@ func mulAddVWW_g(z, x []Word, y, r Word) (c Word) {
|
||||
return
|
||||
}
|
||||
|
||||
func addMulVVW_g(z, x []Word, y Word) (c Word) {
|
||||
func addMulVVWW_g(z, x, y []Word, m, a Word) (c Word) {
|
||||
c = a
|
||||
// The comment near the top of this file discusses this for loop condition.
|
||||
for i := 0; i < len(z) && i < len(x); i++ {
|
||||
z1, z0 := mulAddWWW_g(x[i], y, z[i])
|
||||
for i := 0; i < len(z) && i < len(x) && i < len(y); i++ {
|
||||
z1, z0 := mulAddWWW_g(y[i], m, x[i])
|
||||
lo, cc := bits.Add(uint(z0), uint(c), 0)
|
||||
c, z[i] = Word(cc), Word(lo)
|
||||
c += z1
|
||||
|
@ -177,12 +177,12 @@ X9b: MOVL $0, c+28(FP)
|
||||
RET
|
||||
|
||||
|
||||
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
|
||||
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
|
||||
TEXT ·mulAddVWW(SB),NOSPLIT,$0
|
||||
MOVL z+0(FP), DI
|
||||
MOVL x+12(FP), SI
|
||||
MOVL y+24(FP), BP
|
||||
MOVL r+28(FP), CX // c = r
|
||||
MOVL m+24(FP), BP
|
||||
MOVL a+28(FP), CX // c = a
|
||||
MOVL z_len+4(FP), BX
|
||||
LEAL (DI)(BX*4), DI
|
||||
LEAL (SI)(BX*4), SI
|
||||
@ -204,23 +204,25 @@ E5: CMPL BX, $0 // i < 0
|
||||
RET
|
||||
|
||||
|
||||
// func addMulVVW(z, x []Word, y Word) (c Word)
|
||||
TEXT ·addMulVVW(SB),NOSPLIT,$0
|
||||
MOVL z+0(FP), DI
|
||||
MOVL x+12(FP), SI
|
||||
MOVL y+24(FP), BP
|
||||
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
|
||||
TEXT ·addMulVVWW(SB),NOSPLIT,$0
|
||||
MOVL z+0(FP), BP
|
||||
MOVL x+12(FP), DI
|
||||
MOVL y+24(FP), SI
|
||||
MOVL a+40(FP), CX
|
||||
MOVL z_len+4(FP), BX
|
||||
LEAL (DI)(BX*4), DI
|
||||
LEAL (SI)(BX*4), SI
|
||||
LEAL (BP)(BX*4), BP
|
||||
NEGL BX // i = -n
|
||||
MOVL $0, CX // c = 0
|
||||
JMP E6
|
||||
|
||||
L6: MOVL (SI)(BX*4), AX
|
||||
MULL BP
|
||||
MULL m+36(FP)
|
||||
ADDL CX, AX
|
||||
ADCL $0, DX
|
||||
ADDL AX, (DI)(BX*4)
|
||||
ADDL (DI)(BX*4), AX
|
||||
MOVL AX, (BP)(BX*4)
|
||||
ADCL $0, DX
|
||||
MOVL DX, CX
|
||||
ADDL $1, BX // i++
|
||||
@ -228,7 +230,7 @@ L6: MOVL (SI)(BX*4), AX
|
||||
E6: CMPL BX, $0 // i < 0
|
||||
JL L6
|
||||
|
||||
MOVL CX, c+28(FP)
|
||||
MOVL CX, c+44(FP)
|
||||
RET
|
||||
|
||||
|
||||
|
@ -306,12 +306,12 @@ X9b: MOVQ $0, c+56(FP)
|
||||
RET
|
||||
|
||||
|
||||
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
|
||||
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
|
||||
TEXT ·mulAddVWW(SB),NOSPLIT,$0
|
||||
MOVQ z+0(FP), R10
|
||||
MOVQ x+24(FP), R8
|
||||
MOVQ y+48(FP), R9
|
||||
MOVQ r+56(FP), CX // c = r
|
||||
MOVQ m+48(FP), R9
|
||||
MOVQ a+56(FP), CX // c = a
|
||||
MOVQ z_len+8(FP), R11
|
||||
MOVQ $0, BX // i = 0
|
||||
|
||||
@ -366,16 +366,17 @@ E5: CMPQ BX, R11 // i < n
|
||||
RET
|
||||
|
||||
|
||||
// func addMulVVW(z, x []Word, y Word) (c Word)
|
||||
TEXT ·addMulVVW(SB),NOSPLIT,$0
|
||||
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
|
||||
TEXT ·addMulVVWW(SB),NOSPLIT,$0
|
||||
CMPB ·support_adx(SB), $1
|
||||
JEQ adx
|
||||
MOVQ z+0(FP), R10
|
||||
MOVQ x+24(FP), R8
|
||||
MOVQ y+48(FP), R9
|
||||
MOVQ z+0(FP), R14
|
||||
MOVQ x+24(FP), R10
|
||||
MOVQ y+48(FP), R8
|
||||
MOVQ m+72(FP), R9
|
||||
MOVQ z_len+8(FP), R11
|
||||
MOVQ $0, BX // i = 0
|
||||
MOVQ $0, CX // c = 0
|
||||
MOVQ a+80(FP), CX // c = 0
|
||||
MOVQ R11, R12
|
||||
ANDQ $-2, R12
|
||||
CMPQ R11, $2
|
||||
@ -390,7 +391,7 @@ A6:
|
||||
ADDQ CX, AX
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, CX
|
||||
MOVQ AX, (R10)(BX*8)
|
||||
MOVQ AX, (R14)(BX*8)
|
||||
|
||||
MOVQ (8)(R8)(BX*8), AX
|
||||
MULQ R9
|
||||
@ -399,7 +400,7 @@ A6:
|
||||
ADDQ CX, AX
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, CX
|
||||
MOVQ AX, (8)(R10)(BX*8)
|
||||
MOVQ AX, (8)(R14)(BX*8)
|
||||
|
||||
ADDQ $2, BX
|
||||
CMPQ BX, R12
|
||||
@ -410,7 +411,8 @@ L6: MOVQ (R8)(BX*8), AX
|
||||
MULQ R9
|
||||
ADDQ CX, AX
|
||||
ADCQ $0, DX
|
||||
ADDQ AX, (R10)(BX*8)
|
||||
ADDQ (R10)(BX*8), AX
|
||||
MOVQ AX, (R14)(BX*8)
|
||||
ADCQ $0, DX
|
||||
MOVQ DX, CX
|
||||
ADDQ $1, BX // i++
|
||||
@ -418,21 +420,22 @@ L6: MOVQ (R8)(BX*8), AX
|
||||
E6: CMPQ BX, R11 // i < n
|
||||
JL L6
|
||||
|
||||
MOVQ CX, c+56(FP)
|
||||
MOVQ CX, c+88(FP)
|
||||
RET
|
||||
|
||||
adx:
|
||||
MOVQ z_len+8(FP), R11
|
||||
MOVQ z+0(FP), R10
|
||||
MOVQ x+24(FP), R8
|
||||
MOVQ y+48(FP), DX
|
||||
MOVQ z+0(FP), R14
|
||||
MOVQ x+24(FP), R10
|
||||
MOVQ y+48(FP), R8
|
||||
MOVQ m+72(FP), DX
|
||||
MOVQ $0, BX // i = 0
|
||||
MOVQ $0, CX // carry
|
||||
CMPQ R11, $8
|
||||
JAE adx_loop_header
|
||||
CMPQ BX, R11
|
||||
JL adx_short
|
||||
MOVQ CX, c+56(FP)
|
||||
MOVQ CX, c+88(FP)
|
||||
RET
|
||||
|
||||
adx_loop_header:
|
||||
@ -448,52 +451,54 @@ adx_loop:
|
||||
MULXQ 8(R8), AX, CX
|
||||
ADCXQ DI, AX
|
||||
ADOXQ 8(R10), AX
|
||||
MOVQ AX, 8(R10)
|
||||
MOVQ AX, 8(R14)
|
||||
|
||||
MULXQ 16(R8), SI, DI
|
||||
ADCXQ CX, SI
|
||||
ADOXQ 16(R10), SI
|
||||
MOVQ SI, 16(R10)
|
||||
MOVQ SI, 16(R14)
|
||||
|
||||
MULXQ 24(R8), AX, CX
|
||||
ADCXQ DI, AX
|
||||
ADOXQ 24(R10), AX
|
||||
MOVQ AX, 24(R10)
|
||||
MOVQ AX, 24(R14)
|
||||
|
||||
MULXQ 32(R8), SI, DI
|
||||
ADCXQ CX, SI
|
||||
ADOXQ 32(R10), SI
|
||||
MOVQ SI, 32(R10)
|
||||
MOVQ SI, 32(R14)
|
||||
|
||||
MULXQ 40(R8), AX, CX
|
||||
ADCXQ DI, AX
|
||||
ADOXQ 40(R10), AX
|
||||
MOVQ AX, 40(R10)
|
||||
MOVQ AX, 40(R14)
|
||||
|
||||
MULXQ 48(R8), SI, DI
|
||||
ADCXQ CX, SI
|
||||
ADOXQ 48(R10), SI
|
||||
MOVQ SI, 48(R10)
|
||||
MOVQ SI, 48(R14)
|
||||
|
||||
MULXQ 56(R8), AX, CX
|
||||
ADCXQ DI, AX
|
||||
ADOXQ 56(R10), AX
|
||||
MOVQ AX, 56(R10)
|
||||
MOVQ AX, 56(R14)
|
||||
|
||||
ADCXQ R9, CX
|
||||
ADOXQ R9, CX
|
||||
|
||||
ADDQ $64, R8
|
||||
ADDQ $64, R10
|
||||
ADDQ $64, R14
|
||||
ADDQ $8, BX
|
||||
|
||||
CMPQ BX, R13
|
||||
JL adx_loop
|
||||
MOVQ z+0(FP), R10
|
||||
MOVQ x+24(FP), R8
|
||||
MOVQ z+0(FP), R14
|
||||
MOVQ x+24(FP), R10
|
||||
MOVQ y+48(FP), R8
|
||||
CMPQ BX, R11
|
||||
JL adx_short
|
||||
MOVQ CX, c+56(FP)
|
||||
MOVQ CX, c+88(FP)
|
||||
RET
|
||||
|
||||
adx_short:
|
||||
@ -508,7 +513,7 @@ adx_short:
|
||||
CMPQ BX, R11
|
||||
JL adx_short
|
||||
|
||||
MOVQ CX, c+56(FP)
|
||||
MOVQ CX, c+88(FP)
|
||||
RET
|
||||
|
||||
|
||||
|
@ -215,14 +215,14 @@ X6:
|
||||
RET
|
||||
|
||||
|
||||
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
|
||||
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
|
||||
TEXT ·mulAddVWW(SB),NOSPLIT,$0
|
||||
MOVW $0, R0
|
||||
MOVW z+0(FP), R1
|
||||
MOVW z_len+4(FP), R5
|
||||
MOVW x+12(FP), R2
|
||||
MOVW y+24(FP), R3
|
||||
MOVW r+28(FP), R4
|
||||
MOVW m+24(FP), R3
|
||||
MOVW a+28(FP), R4
|
||||
ADD R5<<2, R1, R5
|
||||
B E8
|
||||
|
||||
@ -242,15 +242,16 @@ E8:
|
||||
RET
|
||||
|
||||
|
||||
// func addMulVVW(z, x []Word, y Word) (c Word)
|
||||
TEXT ·addMulVVW(SB),NOSPLIT,$0
|
||||
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
|
||||
TEXT ·addMulVVWW(SB),NOSPLIT,$0
|
||||
MOVW $0, R0
|
||||
MOVW z+0(FP), R1
|
||||
MOVW z+0(FP), R9
|
||||
MOVW x+12(FP), R1
|
||||
MOVW z_len+4(FP), R5
|
||||
MOVW x+12(FP), R2
|
||||
MOVW y+24(FP), R3
|
||||
MOVW y+24(FP), R2
|
||||
MOVW m+36(FP), R3
|
||||
ADD R5<<2, R1, R5
|
||||
MOVW $0, R4
|
||||
MOVW a+40(FP), R4
|
||||
B E9
|
||||
|
||||
// word loop
|
||||
@ -259,14 +260,14 @@ L9:
|
||||
MULLU R6, R3, (R7, R6)
|
||||
ADD.S R4, R6
|
||||
ADC R0, R7
|
||||
MOVW 0(R1), R4
|
||||
MOVW.P 4(R1), R4
|
||||
ADD.S R4, R6
|
||||
ADC R0, R7
|
||||
MOVW.P R6, 4(R1)
|
||||
MOVW.P R6, 4(R9)
|
||||
MOVW R7, R4
|
||||
E9:
|
||||
TEQ R1, R5
|
||||
BNE L9
|
||||
|
||||
MOVW R4, c+28(FP)
|
||||
MOVW R4, c+44(FP)
|
||||
RET
|
||||
|
@ -425,13 +425,13 @@ len0:
|
||||
RET
|
||||
|
||||
|
||||
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
|
||||
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
|
||||
TEXT ·mulAddVWW(SB),NOSPLIT,$0
|
||||
MOVD z+0(FP), R1
|
||||
MOVD z_len+8(FP), R0
|
||||
MOVD x+24(FP), R2
|
||||
MOVD y+48(FP), R3
|
||||
MOVD r+56(FP), R4
|
||||
MOVD m+48(FP), R3
|
||||
MOVD a+56(FP), R4
|
||||
// c, z = x * y + r
|
||||
TBZ $0, R0, two
|
||||
MOVD.P 8(R2), R5
|
||||
@ -483,33 +483,36 @@ done:
|
||||
RET
|
||||
|
||||
|
||||
// func addMulVVW(z, x []Word, y Word) (c Word)
|
||||
TEXT ·addMulVVW(SB),NOSPLIT,$0
|
||||
MOVD z+0(FP), R1
|
||||
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
|
||||
TEXT ·addMulVVWW(SB),NOSPLIT,$0
|
||||
MOVD z+0(FP), R22
|
||||
MOVD x+24(FP), R1
|
||||
MOVD z_len+8(FP), R0
|
||||
MOVD x+24(FP), R2
|
||||
MOVD y+48(FP), R3
|
||||
MOVD $0, R4
|
||||
MOVD y+48(FP), R2
|
||||
MOVD m+72(FP), R3
|
||||
MOVD a+80(FP), R4
|
||||
|
||||
TBZ $0, R0, two
|
||||
|
||||
MOVD.P 8(R2), R5
|
||||
MOVD (R1), R6
|
||||
MOVD.P 8(R1), R6
|
||||
|
||||
MUL R5, R3, R7
|
||||
UMULH R5, R3, R8
|
||||
|
||||
ADDS R4, R7
|
||||
ADC $0, R8
|
||||
ADDS R7, R6
|
||||
ADC $0, R8, R4
|
||||
|
||||
MOVD.P R6, 8(R1)
|
||||
MOVD.P R6, 8(R22)
|
||||
SUB $1, R0
|
||||
|
||||
two:
|
||||
TBZ $1, R0, loop
|
||||
|
||||
LDP.P 16(R2), (R5, R10)
|
||||
LDP (R1), (R6, R11)
|
||||
LDP.P 16(R1), (R6, R11)
|
||||
|
||||
MUL R10, R3, R13
|
||||
UMULH R10, R3, R12
|
||||
@ -525,7 +528,7 @@ two:
|
||||
ADCS R8, R11
|
||||
ADC $0, R12, R4
|
||||
|
||||
STP.P (R6, R11), 16(R1)
|
||||
STP.P (R6, R11), 16(R22)
|
||||
SUB $2, R0
|
||||
|
||||
// The main loop of this code operates on a block of 4 words every iteration
|
||||
@ -538,12 +541,12 @@ loop:
|
||||
LDP.P 16(R2), (R5, R6)
|
||||
LDP.P 16(R2), (R7, R8)
|
||||
|
||||
LDP (R1), (R9, R10)
|
||||
LDP.P 16(R1), (R9, R10)
|
||||
ADDS R4, R9
|
||||
MUL R6, R3, R14
|
||||
ADCS R14, R10
|
||||
MUL R7, R3, R15
|
||||
LDP 16(R1), (R11, R12)
|
||||
LDP.P 16(R1), (R11, R12)
|
||||
ADCS R15, R11
|
||||
MUL R8, R3, R16
|
||||
ADCS R16, R12
|
||||
@ -555,18 +558,18 @@ loop:
|
||||
UMULH R5, R3, R17
|
||||
ADCS R17, R10
|
||||
UMULH R6, R3, R21
|
||||
STP.P (R9, R10), 16(R1)
|
||||
STP.P (R9, R10), 16(R22)
|
||||
ADCS R21, R11
|
||||
UMULH R7, R3, R19
|
||||
ADCS R19, R12
|
||||
STP.P (R11, R12), 16(R1)
|
||||
STP.P (R11, R12), 16(R22)
|
||||
ADC $0, R20, R4
|
||||
|
||||
SUB $4, R0
|
||||
B loop
|
||||
|
||||
done:
|
||||
MOVD R4, c+56(FP)
|
||||
MOVD R4, c+88(FP)
|
||||
RET
|
||||
|
||||
|
||||
|
@ -83,9 +83,9 @@ func shrVU(z, x []Word, s uint) (c Word)
|
||||
//
|
||||
//go:linkname mulAddVWW
|
||||
//go:noescape
|
||||
func mulAddVWW(z, x []Word, y, r Word) (c Word)
|
||||
func mulAddVWW(z, x []Word, m, a Word) (c Word)
|
||||
|
||||
// addMulVVW should be an internal detail,
|
||||
// addMulVVW should be an internal detail (and a stale one at that),
|
||||
// but widely used packages access it using linkname.
|
||||
// Notable members of the hall of shame include:
|
||||
// - github.com/remyoudompheng/bigfft
|
||||
@ -94,5 +94,11 @@ func mulAddVWW(z, x []Word, y, r Word) (c Word)
|
||||
// See go.dev/issue/67401.
|
||||
//
|
||||
//go:linkname addMulVVW
|
||||
func addMulVVW(z, x []Word, y Word) (c Word) {
|
||||
return addMulVVWW(z, z, x, y, 0)
|
||||
}
|
||||
|
||||
// addMulVVWW sets z = x+y*m+a.
|
||||
//
|
||||
//go:noescape
|
||||
func addMulVVW(z, x []Word, y Word) (c Word)
|
||||
func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
|
||||
|
@ -44,6 +44,6 @@ func mulAddVWW(z, x []Word, y, r Word) (c Word) {
|
||||
return mulAddVWW_g(z, x, y, r)
|
||||
}
|
||||
|
||||
func addMulVVW(z, x []Word, y Word) (c Word) {
|
||||
return addMulVVW_g(z, x, y)
|
||||
func addMulVVWW(z, x, y []Word, m, a Word) (c Word) {
|
||||
return addMulVVWW_g(z, x, y, m, a)
|
||||
}
|
||||
|
@ -30,5 +30,5 @@ TEXT ·shrVU(SB),NOSPLIT,$0
|
||||
TEXT ·mulAddVWW(SB),NOSPLIT,$0
|
||||
JMP ·mulAddVWW_g(SB)
|
||||
|
||||
TEXT ·addMulVVW(SB),NOSPLIT,$0
|
||||
JMP ·addMulVVW_g(SB)
|
||||
TEXT ·addMulVVWW(SB),NOSPLIT,$0
|
||||
JMP ·addMulVVWW_g(SB)
|
||||
|
@ -30,6 +30,6 @@ TEXT ·shrVU(SB),NOSPLIT,$0
|
||||
TEXT ·mulAddVWW(SB),NOSPLIT,$0
|
||||
JMP ·mulAddVWW_g(SB)
|
||||
|
||||
TEXT ·addMulVVW(SB),NOSPLIT,$0
|
||||
JMP ·addMulVVW_g(SB)
|
||||
TEXT ·addMulVVWW(SB),NOSPLIT,$0
|
||||
JMP ·addMulVVWW_g(SB)
|
||||
|
||||
|
@ -30,6 +30,6 @@ TEXT ·shrVU(SB),NOSPLIT,$0
|
||||
TEXT ·mulAddVWW(SB),NOSPLIT,$0
|
||||
JMP ·mulAddVWW_g(SB)
|
||||
|
||||
TEXT ·addMulVVW(SB),NOSPLIT,$0
|
||||
JMP ·addMulVVW_g(SB)
|
||||
TEXT ·addMulVVWW(SB),NOSPLIT,$0
|
||||
JMP ·addMulVVWW_g(SB)
|
||||
|
||||
|
@ -391,7 +391,7 @@ zeroshift:
|
||||
CMPU R11, R7, CR2 // < len?
|
||||
BLT CR2, backward // there is overlap, copy backwards
|
||||
MOVD $0, R14
|
||||
// shlVU processes backwards, but added a forward copy option
|
||||
// shlVU processes backwards, but added a forward copy option
|
||||
// since its faster on POWER
|
||||
repeat:
|
||||
MOVD (R6)(R14), R15 // Copy 8 bytes at a time
|
||||
@ -458,7 +458,7 @@ loopback:
|
||||
BLE loopback
|
||||
CMP R8, R4 // Are we at the last element?
|
||||
BEQ loopexit
|
||||
scalar:
|
||||
scalar:
|
||||
ADD $-1, R8, R10
|
||||
SLD $3, R10
|
||||
MOVD (R6)(R10),R11
|
||||
@ -496,12 +496,12 @@ done:
|
||||
MOVD R0, c+56(FP)
|
||||
RET
|
||||
|
||||
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
|
||||
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
|
||||
TEXT ·mulAddVWW(SB), NOSPLIT, $0
|
||||
MOVD z+0(FP), R10 // R10 = z[]
|
||||
MOVD x+24(FP), R8 // R8 = x[]
|
||||
MOVD y+48(FP), R9 // R9 = y
|
||||
MOVD r+56(FP), R4 // R4 = r = c
|
||||
MOVD m+48(FP), R9 // R9 = m
|
||||
MOVD a+56(FP), R4 // R4 = a = c
|
||||
MOVD z_len+8(FP), R11 // R11 = z_len
|
||||
|
||||
CMP R11, $0
|
||||
@ -587,59 +587,61 @@ done:
|
||||
MOVD R4, c+64(FP)
|
||||
RET
|
||||
|
||||
// func addMulVVW(z, x []Word, y Word) (c Word)
|
||||
TEXT ·addMulVVW(SB), NOSPLIT, $0
|
||||
MOVD z+0(FP), R3 // R3 = z[]
|
||||
MOVD x+24(FP), R4 // R4 = x[]
|
||||
MOVD y+48(FP), R5 // R5 = y
|
||||
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
|
||||
TEXT ·addMulVVWW(SB), NOSPLIT, $0
|
||||
MOVD z+0(FP), R22 // R22 = z[]
|
||||
MOVD x+24(FP), R3 // R3 = x[]
|
||||
MOVD y+48(FP), R4 // R4 = y[]
|
||||
MOVD m+72(FP), R5 // R5 = m
|
||||
MOVD z_len+8(FP), R6 // R6 = z_len
|
||||
|
||||
CMP R6, $4
|
||||
MOVD R0, R9 // R9 = c = 0
|
||||
MOVD a+80(FP), R9 // R9 = c = a
|
||||
BLT tail
|
||||
SRD $2, R6, R7
|
||||
MOVD R7, CTR // Initialize loop counter
|
||||
PCALIGN $16
|
||||
|
||||
loop:
|
||||
MOVD 0(R4), R14 // x[i]
|
||||
MOVD 8(R4), R16 // x[i+1]
|
||||
MOVD 16(R4), R18 // x[i+2]
|
||||
MOVD 24(R4), R20 // x[i+3]
|
||||
MOVD 0(R3), R15 // z[i]
|
||||
MOVD 8(R3), R17 // z[i+1]
|
||||
MOVD 16(R3), R19 // z[i+2]
|
||||
MOVD 24(R3), R21 // z[i+3]
|
||||
MULLD R5, R14, R10 // low x[i]*y
|
||||
MULHDU R5, R14, R11 // high x[i]*y
|
||||
MOVD 0(R4), R14 // y[i]
|
||||
MOVD 8(R4), R16 // y[i+1]
|
||||
MOVD 16(R4), R18 // y[i+2]
|
||||
MOVD 24(R4), R20 // y[i+3]
|
||||
MOVD 0(R3), R15 // x[i]
|
||||
MOVD 8(R3), R17 // x[i+1]
|
||||
MOVD 16(R3), R19 // x[i+2]
|
||||
MOVD 24(R3), R21 // x[i+3]
|
||||
MULLD R5, R14, R10 // low y[i]*m
|
||||
MULHDU R5, R14, R11 // high y[i]*m
|
||||
ADDC R15, R10
|
||||
ADDZE R11
|
||||
ADDC R9, R10
|
||||
ADDZE R11, R9
|
||||
MULLD R5, R16, R14 // low x[i+1]*y
|
||||
MULHDU R5, R16, R15 // high x[i+1]*y
|
||||
MULLD R5, R16, R14 // low y[i+1]*m
|
||||
MULHDU R5, R16, R15 // high y[i+1]*m
|
||||
ADDC R17, R14
|
||||
ADDZE R15
|
||||
ADDC R9, R14
|
||||
ADDZE R15, R9
|
||||
MULLD R5, R18, R16 // low x[i+2]*y
|
||||
MULHDU R5, R18, R17 // high x[i+2]*y
|
||||
MULLD R5, R18, R16 // low y[i+2]*m
|
||||
MULHDU R5, R18, R17 // high y[i+2]*m
|
||||
ADDC R19, R16
|
||||
ADDZE R17
|
||||
ADDC R9, R16
|
||||
ADDZE R17, R9
|
||||
MULLD R5, R20, R18 // low x[i+3]*y
|
||||
MULHDU R5, R20, R19 // high x[i+3]*y
|
||||
MULLD R5, R20, R18 // low y[i+3]*m
|
||||
MULHDU R5, R20, R19 // high y[i+3]*m
|
||||
ADDC R21, R18
|
||||
ADDZE R19
|
||||
ADDC R9, R18
|
||||
ADDZE R19, R9
|
||||
MOVD R10, 0(R3) // z[i]
|
||||
MOVD R14, 8(R3) // z[i+1]
|
||||
MOVD R16, 16(R3) // z[i+2]
|
||||
MOVD R18, 24(R3) // z[i+3]
|
||||
MOVD R10, 0(R22) // z[i]
|
||||
MOVD R14, 8(R22) // z[i+1]
|
||||
MOVD R16, 16(R22) // z[i+2]
|
||||
MOVD R18, 24(R22) // z[i+3]
|
||||
ADD $32, R3
|
||||
ADD $32, R4
|
||||
ADD $32, R22
|
||||
BDNZ loop
|
||||
|
||||
ANDCC $3, R6
|
||||
@ -657,12 +659,13 @@ tailloop:
|
||||
ADDZE R11
|
||||
ADDC R9, R10
|
||||
ADDZE R11, R9
|
||||
MOVD R10, 0(R3)
|
||||
MOVD R10, 0(R22)
|
||||
ADD $8, R3
|
||||
ADD $8, R4
|
||||
ADD $8, R22
|
||||
BDNZ tailloop
|
||||
|
||||
done:
|
||||
MOVD R9, c+56(FP)
|
||||
MOVD R9, c+88(FP)
|
||||
RET
|
||||
|
||||
|
@ -301,10 +301,10 @@ TEXT ·shrVU(SB),NOSPLIT,$0
|
||||
|
||||
TEXT ·mulAddVWW(SB),NOSPLIT,$0
|
||||
MOV x+24(FP), X5
|
||||
MOV y+48(FP), X6
|
||||
MOV m+48(FP), X6
|
||||
MOV z+0(FP), X7
|
||||
MOV z_len+8(FP), X30
|
||||
MOV r+56(FP), X29
|
||||
MOV a+56(FP), X29
|
||||
|
||||
MOV $4, X28
|
||||
|
||||
@ -317,26 +317,26 @@ loop4:
|
||||
MOV 16(X5), X14 // x[2]
|
||||
MOV 24(X5), X17 // x[3]
|
||||
|
||||
MULHU X8, X6, X9 // z_hi[0] = x[0] * y
|
||||
MUL X8, X6, X8 // z_lo[0] = x[0] * y
|
||||
MULHU X8, X6, X9 // z_hi[0] = x[0] * m
|
||||
MUL X8, X6, X8 // z_lo[0] = x[0] * m
|
||||
ADD X8, X29, X10 // z[0] = z_lo[0] + c
|
||||
SLTU X8, X10, X23
|
||||
ADD X23, X9, X29 // next c
|
||||
|
||||
MULHU X11, X6, X12 // z_hi[1] = x[1] * y
|
||||
MUL X11, X6, X11 // z_lo[1] = x[1] * y
|
||||
MULHU X11, X6, X12 // z_hi[1] = x[1] * m
|
||||
MUL X11, X6, X11 // z_lo[1] = x[1] * m
|
||||
ADD X11, X29, X13 // z[1] = z_lo[1] + c
|
||||
SLTU X11, X13, X23
|
||||
ADD X23, X12, X29 // next c
|
||||
|
||||
MULHU X14, X6, X15 // z_hi[2] = x[2] * y
|
||||
MUL X14, X6, X14 // z_lo[2] = x[2] * y
|
||||
MULHU X14, X6, X15 // z_hi[2] = x[2] * m
|
||||
MUL X14, X6, X14 // z_lo[2] = x[2] * m
|
||||
ADD X14, X29, X16 // z[2] = z_lo[2] + c
|
||||
SLTU X14, X16, X23
|
||||
ADD X23, X15, X29 // next c
|
||||
|
||||
MULHU X17, X6, X18 // z_hi[3] = x[3] * y
|
||||
MUL X17, X6, X17 // z_lo[3] = x[3] * y
|
||||
MULHU X17, X6, X18 // z_hi[3] = x[3] * m
|
||||
MUL X17, X6, X17 // z_lo[3] = x[3] * m
|
||||
ADD X17, X29, X19 // z[3] = z_lo[3] + c
|
||||
SLTU X17, X19, X23
|
||||
ADD X23, X18, X29 // next c
|
||||
@ -356,8 +356,8 @@ loop4:
|
||||
loop1:
|
||||
MOV 0(X5), X10 // x
|
||||
|
||||
MULHU X10, X6, X12 // z_hi = x * y
|
||||
MUL X10, X6, X10 // z_lo = x * y
|
||||
MULHU X10, X6, X12 // z_hi = x * m
|
||||
MUL X10, X6, X10 // z_lo = x * m
|
||||
ADD X10, X29, X13 // z_lo + c
|
||||
SLTU X10, X13, X15
|
||||
ADD X12, X15, X29 // next c
|
||||
@ -374,97 +374,100 @@ done:
|
||||
MOV X29, c+64(FP) // return c
|
||||
RET
|
||||
|
||||
TEXT ·addMulVVW(SB),NOSPLIT,$0
|
||||
MOV x+24(FP), X5
|
||||
MOV y+48(FP), X6
|
||||
MOV z+0(FP), X7
|
||||
TEXT ·addMulVVWW(SB),NOSPLIT,$0
|
||||
MOV y+48(FP), X5
|
||||
MOV m+72(FP), X6
|
||||
MOV x+24(FP), X7
|
||||
MOV z+0(FP), X20
|
||||
MOV z_len+8(FP), X30
|
||||
|
||||
MOV $4, X28
|
||||
MOV $0, X29 // c = 0
|
||||
MOV a+80(FP), X29 // c = a
|
||||
|
||||
BEQZ X30, done
|
||||
BLTU X30, X28, loop1
|
||||
|
||||
loop4:
|
||||
MOV 0(X5), X8 // x[0]
|
||||
MOV 0(X7), X10 // z[0]
|
||||
MOV 8(X5), X11 // x[1]
|
||||
MOV 8(X7), X13 // z[1]
|
||||
MOV 16(X5), X14 // x[2]
|
||||
MOV 16(X7), X16 // z[2]
|
||||
MOV 24(X5), X17 // x[3]
|
||||
MOV 24(X7), X19 // z[3]
|
||||
MOV 0(X5), X8 // y[0]
|
||||
MOV 0(X7), X10 // x[0]
|
||||
MOV 8(X5), X11 // y[1]
|
||||
MOV 8(X7), X13 // x[1]
|
||||
MOV 16(X5), X14 // y[2]
|
||||
MOV 16(X7), X16 // x[2]
|
||||
MOV 24(X5), X17 // y[3]
|
||||
MOV 24(X7), X19 // x[3]
|
||||
|
||||
MULHU X8, X6, X9 // z_hi[0] = x[0] * y
|
||||
MUL X8, X6, X8 // z_lo[0] = x[0] * y
|
||||
ADD X8, X10, X21 // z_lo[0] = x[0] * y + z[0]
|
||||
MULHU X8, X6, X9 // x_hi[0] = y[0] * m
|
||||
MUL X8, X6, X8 // x_lo[0] = y[0] * m
|
||||
ADD X8, X10, X21 // x_lo[0] = y[0] * m + x[0]
|
||||
SLTU X8, X21, X22
|
||||
ADD X9, X22, X9 // z_hi[0] = x[0] * y + z[0]
|
||||
ADD X21, X29, X10 // z[0] = x[0] * y + z[0] + c
|
||||
ADD X9, X22, X9 // x_hi[0] = y[0] * m + x[0]
|
||||
ADD X21, X29, X10 // x[0] = y[0] * m + x[0] + c
|
||||
SLTU X21, X10, X22
|
||||
ADD X9, X22, X29 // next c
|
||||
|
||||
MULHU X11, X6, X12 // z_hi[1] = x[1] * y
|
||||
MUL X11, X6, X11 // z_lo[1] = x[1] * y
|
||||
ADD X11, X13, X21 // z_lo[1] = x[1] * y + z[1]
|
||||
MULHU X11, X6, X12 // x_hi[1] = y[1] * m
|
||||
MUL X11, X6, X11 // x_lo[1] = y[1] * m
|
||||
ADD X11, X13, X21 // x_lo[1] = y[1] * m + x[1]
|
||||
SLTU X11, X21, X22
|
||||
ADD X12, X22, X12 // z_hi[1] = x[1] * y + z[1]
|
||||
ADD X21, X29, X13 // z[1] = x[1] * y + z[1] + c
|
||||
ADD X12, X22, X12 // x_hi[1] = y[1] * m + x[1]
|
||||
ADD X21, X29, X13 // x[1] = y[1] * m + x[1] + c
|
||||
SLTU X21, X13, X22
|
||||
ADD X12, X22, X29 // next c
|
||||
|
||||
MULHU X14, X6, X15 // z_hi[2] = x[2] * y
|
||||
MUL X14, X6, X14 // z_lo[2] = x[2] * y
|
||||
ADD X14, X16, X21 // z_lo[2] = x[2] * y + z[2]
|
||||
MULHU X14, X6, X15 // x_hi[2] = y[2] * m
|
||||
MUL X14, X6, X14 // x_lo[2] = y[2] * m
|
||||
ADD X14, X16, X21 // x_lo[2] = y[2] * m + x[2]
|
||||
SLTU X14, X21, X22
|
||||
ADD X15, X22, X15 // z_hi[2] = x[2] * y + z[2]
|
||||
ADD X21, X29, X16 // z[2] = x[2] * y + z[2] + c
|
||||
ADD X15, X22, X15 // x_hi[2] = y[2] * m + x[2]
|
||||
ADD X21, X29, X16 // x[2] = y[2] * m + x[2] + c
|
||||
SLTU X21, X16, X22
|
||||
ADD X15, X22, X29 // next c
|
||||
|
||||
MULHU X17, X6, X18 // z_hi[3] = x[3] * y
|
||||
MUL X17, X6, X17 // z_lo[3] = x[3] * y
|
||||
ADD X17, X19, X21 // z_lo[3] = x[3] * y + z[3]
|
||||
MULHU X17, X6, X18 // x_hi[3] = y[3] * m
|
||||
MUL X17, X6, X17 // x_lo[3] = y[3] * m
|
||||
ADD X17, X19, X21 // x_lo[3] = y[3] * m + x[3]
|
||||
SLTU X17, X21, X22
|
||||
ADD X18, X22, X18 // z_hi[3] = x[3] * y + z[3]
|
||||
ADD X21, X29, X19 // z[3] = x[3] * y + z[3] + c
|
||||
ADD X18, X22, X18 // x_hi[3] = y[3] * m + x[3]
|
||||
ADD X21, X29, X19 // x[3] = y[3] * m + x[3] + c
|
||||
SLTU X21, X19, X22
|
||||
ADD X18, X22, X29 // next c
|
||||
|
||||
MOV X10, 0(X7) // z[0]
|
||||
MOV X13, 8(X7) // z[1]
|
||||
MOV X16, 16(X7) // z[2]
|
||||
MOV X19, 24(X7) // z[3]
|
||||
MOV X10, 0(X20) // z[0]
|
||||
MOV X13, 8(X20) // z[1]
|
||||
MOV X16, 16(X20) // z[2]
|
||||
MOV X19, 24(X20) // z[3]
|
||||
|
||||
ADD $32, X5
|
||||
ADD $32, X7
|
||||
ADD $32, X20
|
||||
SUB $4, X30
|
||||
|
||||
BGEU X30, X28, loop4
|
||||
BEQZ X30, done
|
||||
|
||||
loop1:
|
||||
MOV 0(X5), X10 // x
|
||||
MOV 0(X7), X11 // z
|
||||
MOV 0(X5), X10 // y
|
||||
MOV 0(X7), X11 // x
|
||||
|
||||
MULHU X10, X6, X12 // z_hi = x * y
|
||||
MUL X10, X6, X10 // z_lo = x * y
|
||||
ADD X10, X11, X13 // z_lo = x * y + z
|
||||
MULHU X10, X6, X12 // z_hi = y * m
|
||||
MUL X10, X6, X10 // z_lo = y * m
|
||||
ADD X10, X11, X13 // z_lo = y * m + x
|
||||
SLTU X10, X13, X15
|
||||
ADD X12, X15, X12 // z_hi = x * y + z
|
||||
ADD X13, X29, X10 // z = x * y + z + c
|
||||
ADD X12, X15, X12 // z_hi = y * m + x
|
||||
ADD X13, X29, X10 // z = y * m + x + c
|
||||
SLTU X13, X10, X15
|
||||
ADD X12, X15, X29 // next c
|
||||
|
||||
MOV X10, 0(X7) // z
|
||||
MOV X10, 0(X20) // z
|
||||
|
||||
ADD $8, X5
|
||||
ADD $8, X7
|
||||
ADD $8, X20
|
||||
SUB $1, X30
|
||||
|
||||
BNEZ X30, loop1
|
||||
|
||||
done:
|
||||
MOV X29, c+56(FP) // return c
|
||||
MOV X29, c+88(FP) // return c
|
||||
RET
|
||||
|
@ -691,12 +691,12 @@ TEXT ·shrVU(SB), NOSPLIT, $0
|
||||
BR ·shrVU_g(SB)
|
||||
|
||||
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, DX = r3, AX = r6, BX = R1, (R0 set to 0) + use R11 + use R7 for i
|
||||
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
|
||||
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
|
||||
TEXT ·mulAddVWW(SB), NOSPLIT, $0
|
||||
MOVD z+0(FP), R2
|
||||
MOVD x+24(FP), R8
|
||||
MOVD y+48(FP), R9
|
||||
MOVD r+56(FP), R4 // c = r
|
||||
MOVD m+48(FP), R9
|
||||
MOVD a+56(FP), R4 // c = a
|
||||
MOVD z_len+8(FP), R5
|
||||
MOVD $0, R1 // i = 0
|
||||
MOVD $0, R7 // i*8 = 0
|
||||
@ -719,18 +719,19 @@ E5:
|
||||
MOVD R4, c+64(FP)
|
||||
RET
|
||||
|
||||
// func addMulVVW(z, x []Word, y Word) (c Word)
|
||||
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
|
||||
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1, (R0 set to 0) + use R11 + use R7 for i
|
||||
TEXT ·addMulVVW(SB), NOSPLIT, $0
|
||||
MOVD z+0(FP), R2
|
||||
MOVD x+24(FP), R8
|
||||
MOVD y+48(FP), R9
|
||||
TEXT ·addMulVVWW(SB), NOSPLIT, $0
|
||||
MOVD z+0(FP), R3
|
||||
MOVD x+24(FP), R2
|
||||
MOVD y+48(FP), R8
|
||||
MOVD m+72(FP), R9
|
||||
MOVD z_len+8(FP), R5
|
||||
|
||||
MOVD $0, R1 // i*8 = 0
|
||||
MOVD $0, R7 // i = 0
|
||||
MOVD $0, R0 // make sure it's zero
|
||||
MOVD $0, R4 // c = 0
|
||||
MOVD a+80(FP), R4 // c = 0
|
||||
|
||||
MOVD R5, R12
|
||||
AND $-2, R12
|
||||
@ -746,7 +747,7 @@ A6:
|
||||
ADDC R4, R11
|
||||
ADDE R0, R6
|
||||
MOVD R6, R4
|
||||
MOVD R11, (R2)(R1*1)
|
||||
MOVD R11, (R3)(R1*1)
|
||||
|
||||
MOVD (8)(R8)(R1*1), R6
|
||||
MULHDU R9, R6
|
||||
@ -756,7 +757,7 @@ A6:
|
||||
ADDC R4, R11
|
||||
ADDE R0, R6
|
||||
MOVD R6, R4
|
||||
MOVD R11, (8)(R2)(R1*1)
|
||||
MOVD R11, (8)(R3)(R1*1)
|
||||
|
||||
ADD $16, R1 // i*8 + 8
|
||||
ADD $2, R7 // i++
|
||||
@ -773,7 +774,7 @@ L6:
|
||||
ADDC R4, R11
|
||||
ADDE R0, R6
|
||||
MOVD R6, R4
|
||||
MOVD R11, (R2)(R1*1)
|
||||
MOVD R11, (R3)(R1*1)
|
||||
|
||||
ADD $8, R1 // i*8 + 8
|
||||
ADD $1, R7 // i++
|
||||
@ -781,6 +782,6 @@ L6:
|
||||
E6:
|
||||
CMPBLT R7, R5, L6 // i < n
|
||||
|
||||
MOVD R4, c+56(FP)
|
||||
MOVD R4, c+88(FP)
|
||||
RET
|
||||
|
||||
|
@ -629,7 +629,7 @@ func BenchmarkMulAddVWW(b *testing.B) {
|
||||
if isRaceBuilder && n > 1e3 {
|
||||
continue
|
||||
}
|
||||
z := make([]Word, n+1)
|
||||
z := make([]Word, n)
|
||||
x := rndV(n)
|
||||
y := rndW()
|
||||
r := rndW()
|
||||
@ -642,18 +642,20 @@ func BenchmarkMulAddVWW(b *testing.B) {
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkAddMulVVW(b *testing.B) {
|
||||
func BenchmarkAddMulVVWW(b *testing.B) {
|
||||
for _, n := range benchSizes {
|
||||
if isRaceBuilder && n > 1e3 {
|
||||
continue
|
||||
}
|
||||
x := rndV(n)
|
||||
y := rndW()
|
||||
z := make([]Word, n)
|
||||
x := rndV(n)
|
||||
y := rndV(n)
|
||||
m := rndW()
|
||||
a := rndW()
|
||||
b.Run(fmt.Sprint(n), func(b *testing.B) {
|
||||
b.SetBytes(int64(n * _W))
|
||||
for i := 0; i < b.N; i++ {
|
||||
addMulVVW(z, x, y)
|
||||
addMulVVWW(z, x, y, m, a)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
@ -27,6 +27,6 @@ TEXT ·shrVU(SB),NOSPLIT,$0
|
||||
TEXT ·mulAddVWW(SB),NOSPLIT,$0
|
||||
JMP ·mulAddVWW_g(SB)
|
||||
|
||||
TEXT ·addMulVVW(SB),NOSPLIT,$0
|
||||
JMP ·addMulVVW_g(SB)
|
||||
TEXT ·addMulVVWW(SB),NOSPLIT,$0
|
||||
JMP ·addMulVVWW_g(SB)
|
||||
|
||||
|
@ -197,9 +197,9 @@ func (z nat) montgomery(x, y, m nat, k Word, n int) nat {
|
||||
var c Word
|
||||
for i := 0; i < n; i++ {
|
||||
d := y[i]
|
||||
c2 := addMulVVW(z[i:n+i], x, d)
|
||||
c2 := addMulVVWW(z[i:n+i], z[i:n+i], x, d, 0)
|
||||
t := z[i] * k
|
||||
c3 := addMulVVW(z[i:n+i], m, t)
|
||||
c3 := addMulVVWW(z[i:n+i], z[i:n+i], m, t, 0)
|
||||
cx := c + c2
|
||||
cy := cx + c3
|
||||
z[n+i] = cy
|
||||
|
@ -126,7 +126,7 @@ func basicSqr(stk *stack, z, x nat) {
|
||||
// z collects the squares x[i] * x[i]
|
||||
z[2*i+1], z[2*i] = mulWW(d, d)
|
||||
// t collects the products x[i] * x[j] where j < i
|
||||
t[2*i] = addMulVVW(t[i:2*i], x[0:i], d)
|
||||
t[2*i] = addMulVVWW(t[i:2*i], t[i:2*i], x[0:i], d, 0)
|
||||
}
|
||||
t[2*n-1] = shlVU(t[1:2*n-1], t[1:2*n-1], 1) // double the j < i products
|
||||
addVV(z, z, t) // combine the result
|
||||
@ -152,7 +152,7 @@ func basicMul(z, x, y nat) {
|
||||
clear(z[0 : len(x)+len(y)]) // initialize z
|
||||
for i, d := range y {
|
||||
if d != 0 {
|
||||
z[len(x)+i] = addMulVVW(z[i:i+len(x)], x, d)
|
||||
z[len(x)+i] = addMulVVWW(z[i:i+len(x)], z[i:i+len(x)], x, d, 0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user