diff --git a/src/math/big/arith_decl_s390x.go b/src/math/big/arith_decl_s390x.go index 38fefcc7b0..5973d3cfc1 100644 --- a/src/math/big/arith_decl_s390x.go +++ b/src/math/big/arith_decl_s390x.go @@ -12,9 +12,6 @@ func addVV_novec(z, x, y []Word) (c Word) func subVV_check(z, x, y []Word) (c Word) func subVV_vec(z, x, y []Word) (c Word) func subVV_novec(z, x, y []Word) (c Word) -func subVW_check(z, x []Word, y Word) (c Word) -func subVW_vec(z, x []Word, y Word) (c Word) -func subVW_novec(z, x []Word, y Word) (c Word) func hasVectorFacility() bool var hasVX = hasVectorFacility() diff --git a/src/math/big/arith_s390x.s b/src/math/big/arith_s390x.s index f0464df201..ef0192224f 100644 --- a/src/math/big/arith_s390x.s +++ b/src/math/big/arith_s390x.s @@ -631,220 +631,95 @@ returnC: RET TEXT ·subVW(SB), NOSPLIT, $0 - MOVD subwvectorfacility+0x00(SB), R1 - BR (R1) + MOVD z_len+8(FP), R5 + MOVD x+24(FP), R6 + MOVD y+48(FP), R7 // The borrow bit passed in + MOVD z+0(FP), R8 + MOVD $0, R0 // R0 is a temporary variable used during computation. Ensure it has zero in it. -TEXT ·subVW_check(SB), NOSPLIT, $0 - MOVB ·hasVX(SB), R1 - CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported - MOVD $subwvectorfacility+0x00(SB), R1 - MOVD $·subVW_novec(SB), R2 - MOVD R2, 0(R1) + CMPBEQ R5, $0, returnC // len(z) == 0, have an early return - // MOVD $·subVW_novec(SB), 0(R1) - BR ·subVW_novec(SB) + // Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag + MOVD 0(R6), R9 + SUBC R7, R9 + MOVD R9, 0(R8) + CMPBEQ R5, $1, returnResult + MOVD 8(R6), R9 + SUBE R0, R9 + MOVD R9, 8(R8) + CMPBEQ R5, $2, returnResult -vectorimpl: - MOVD $subwvectorfacility+0x00(SB), R1 - MOVD $·subVW_vec(SB), R2 - MOVD R2, 0(R1) + // Update the counters + MOVD $16, R12 // i = 2 + MOVD $-2(R5), R5 // n = n - 2 - // MOVD $·subVW_vec(SB), 0(R1) - BR ·subVW_vec(SB) +loopOverEachWord: + BRC $3, copySetup // no borrow, copy the rest + MOVD 0(R6)(R12*1), R9 -GLOBL subwvectorfacility+0x00(SB), NOPTR, $8 -DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB) + // Originally we used the borrow flag generated in the previous iteration + // (i.e: SUBE could be used here to do the subtraction). However, since we + // already know borrow is 1 (otherwise we will go to copy section), we can + // use SUBC here so the current iteration does not depend on the borrow flag + // generated in the previous iteration. This could be useful when branch prediction happens. + SUBC $1, R9 + MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1 -// func subVW(z, x []Word, y Word) (c Word) -TEXT ·subVW_vec(SB), NOSPLIT, $0 - MOVD z_len+8(FP), R3 - MOVD x+24(FP), R8 - MOVD y+48(FP), R4 // c = y - MOVD z+0(FP), R2 - - MOVD $0, R0 // make sure it's zero - MOVD $0, R10 // i = 0 - MOVD R8, R5 - MOVD R2, R7 - - // s/JL/JMP/ below to disable the unrolled loop - SUB $4, R3 // n -= 4 - BLT v11 // if n < 0 goto v11 - SUB $12, R3 - BLT A11 - - VZERO V0 - MOVD $1, R6 // prepare V0 to be final carry register - VLVGG $1, R6, V0 // borrow is initially "no borrow" - VZERO V9 // to ensure upper half is zero - VLVGG $1, R4, V9 - - // n >= 0 - // regular loop body unrolled 16x - -UU1: - VLM 0(R5), V1, V4 // 64-bytes into V1..V4 - ADD $64, R5 - VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order - VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order - - VSBCBIQ V1, V9, V0, V25 - VSBIQ V1, V9, V0, V17 - VZERO V9 - VSBCBIQ V2, V9, V25, V26 - VSBIQ V2, V9, V25, V18 - - VLM 0(R5), V5, V6 // 32-bytes into V5..V6 - ADD $32, R5 - - VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order - VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order - - VSBCBIQ V3, V9, V26, V27 - VSBIQ V3, V9, V26, V19 - VSBCBIQ V4, V9, V27, V28 - VSBIQ V4, V9, V27, V20 - - VLM 0(R5), V7, V8 // 32-bytes into V7..V8 - ADD $32, R5 - - VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order - VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order - - VSBCBIQ V5, V9, V28, V29 - VSBIQ V5, V9, V28, V21 - VSBCBIQ V6, V9, V29, V30 - VSBIQ V6, V9, V29, V22 - - VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order - VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order - - VSBCBIQ V7, V9, V30, V31 - VSBIQ V7, V9, V30, V23 - VSBCBIQ V8, V9, V31, V0 // V0 has carry-over - VSBIQ V8, V9, V31, V24 - - VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order - VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order - VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order - VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order - VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order - VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order - VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order - VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order - VSTM V17, V24, 0(R7) // 128-bytes into z - ADD $128, R7 - ADD $128, R10 // i += 16 - SUB $16, R3 // n -= 16 - BGE UU1 // if n >= 0 goto U1 - VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10 - SUB $1, R4 // save cf - NEG R4, R4 - -A11: - ADD $12, R3 // n += 16 - - BLT v11 // if n < 0 goto v11 - - // n >= 0 - // regular loop body unrolled 4x - -U4: // n >= 0 - // regular loop body unrolled 4x - MOVD 0(R8)(R10*1), R5 - MOVD 8(R8)(R10*1), R6 - MOVD 16(R8)(R10*1), R7 - MOVD 24(R8)(R10*1), R1 - SUBC R4, R5 // SLGR -> SUBC - SUBE R0, R6 // SLBGR -> SUBE - SUBE R0, R7 - SUBE R0, R1 - SUBE R4, R4 // save CF - NEG R4, R4 - MOVD R5, 0(R2)(R10*1) - MOVD R6, 8(R2)(R10*1) - MOVD R7, 16(R2)(R10*1) - MOVD R1, 24(R2)(R10*1) - - ADD $32, R10 // i += 4 -> i +=32 - SUB $4, R3 // n -= 4 - BGE U4 // if n >= 0 goto U4 - -v11: - ADD $4, R3 // n += 4 - BLE E11 // if n <= 0 goto E4 - -L4: // n > 0 - - MOVD 0(R8)(R10*1), R5 - SUBC R4, R5 - SUBE R4, R4 // save CF - NEG R4, R4 - MOVD R5, 0(R2)(R10*1) - - ADD $8, R10 // i++ - SUB $1, R3 // n-- - BGT L4 // if n > 0 goto L4 - -E11: - MOVD R4, c+56(FP) // return c + MOVD $8(R12), R12 // i++ + BRCTG R5, loopOverEachWord // n-- +// return the current borrow value +returnResult: + SUBE R0, R0 + NEG R0, R0 + MOVD R0, c+56(FP) RET -// DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) -// func subVW(z, x []Word, y Word) (c Word) -// (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names) -TEXT ·subVW_novec(SB), NOSPLIT, $0 - MOVD z_len+8(FP), R3 - MOVD x+24(FP), R8 - MOVD y+48(FP), R4 // c = y - MOVD z+0(FP), R2 - MOVD $0, R0 // make sure it's 0 - MOVD $0, R10 // i = 0 +// Update position of x(R6) and z(R8) based on the current counter value and perform copying. +// With the assumption that x and z will not overlap with each other or x and z will +// point to same memory region, we can use a faster version of copy using only MVC here. +// In the following implementation, we have three copy loops, each copying a word, 4 words, and +// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove. +copySetup: + ADD R12, R6 + ADD R12, R8 - // s/JL/JMP/ below to disable the unrolled loop - SUB $4, R3 // n -= 4 - BLT v4 // if n < 4 goto v4 + CMPBGE R5, $4, mediumLoop -U4: // n >= 0 - // regular loop body unrolled 4x - MOVD 0(R8)(R10*1), R5 - MOVD 8(R8)(R10*1), R6 - MOVD 16(R8)(R10*1), R7 - MOVD 24(R8)(R10*1), R1 - SUBC R4, R5 // SLGR -> SUBC - SUBE R0, R6 // SLBGR -> SUBE - SUBE R0, R7 - SUBE R0, R1 - SUBE R4, R4 // save CF - NEG R4, R4 - MOVD R5, 0(R2)(R10*1) - MOVD R6, 8(R2)(R10*1) - MOVD R7, 16(R2)(R10*1) - MOVD R1, 24(R2)(R10*1) +smallLoop: // does a loop unrolling to copy word when n < 4 + CMPBEQ R5, $0, returnZero + MVC $8, 0(R6), 0(R8) + CMPBEQ R5, $1, returnZero + MVC $8, 8(R6), 8(R8) + CMPBEQ R5, $2, returnZero + MVC $8, 16(R6), 16(R8) - ADD $32, R10 // i += 4 -> i +=32 - SUB $4, R3 // n -= 4 - BGE U4 // if n >= 0 goto U4 +returnZero: + MOVD $0, c+56(FP) // return 0 as borrow + RET -v4: - ADD $4, R3 // n += 4 - BLE E4 // if n <= 0 goto E4 +mediumLoop: + CMPBLT R5, $4, smallLoop + CMPBLT R5, $32, mediumLoopBody -L4: // n > 0 - MOVD 0(R8)(R10*1), R5 - SUBC R4, R5 - SUBE R4, R4 // save CF - NEG R4, R4 - MOVD R5, 0(R2)(R10*1) +largeLoop: // Copying 256 bytes at a time + MVC $256, 0(R6), 0(R8) + MOVD $256(R6), R6 + MOVD $256(R8), R8 + MOVD $-32(R5), R5 + CMPBGE R5, $32, largeLoop + BR mediumLoop - ADD $8, R10 // i++ - SUB $1, R3 // n-- - BGT L4 // if n > 0 goto L4 - -E4: - MOVD R4, c+56(FP) // return c +mediumLoopBody: // Copying 32 bytes at a time + MVC $32, 0(R6), 0(R8) + MOVD $32(R6), R6 + MOVD $32(R8), R8 + MOVD $-4(R5), R5 + CMPBGE R5, $4, mediumLoopBody + BR smallLoop +returnC: + MOVD R7, c+56(FP) RET // func shlVU(z, x []Word, s uint) (c Word) diff --git a/src/math/big/arith_s390x_test.go b/src/math/big/arith_s390x_test.go index e7f1b57ea5..ce6bca8885 100644 --- a/src/math/big/arith_s390x_test.go +++ b/src/math/big/arith_s390x_test.go @@ -30,12 +30,3 @@ func TestFunVVnovec(t *testing.T) { } } } - -func TestFunVWnovec(t *testing.T) { - if hasVX == true { - for _, a := range sumVW { - arg := argVW{a.x, a.z, a.y, a.c} - testFunVW(t, "subVW_novec", subVW_novec, arg) - } - } -}