math/big: rewrite subVW to use fast path on s390x

This CL replaces the original subVW implementation with an implementation that uses an idea similar to CL 164968. When we know the borrow bit is zero, we can copy the rest of the words, as they will not be updated. Also, since we are copying a vector of words, a faster implementation of copy is written in this CL to copy one word or multiple words at a time.

Benchmarks:

name             old time/op    new time/op    delta
SubVW/1-18       4.43ns ± 0%    3.82ns ± 0%    -13.85%  (p=0.000 n=20+20)
SubVW/2-18       5.39ns ± 0%    4.25ns ± 0%    -21.23%  (p=0.000 n=20+20)
SubVW/3-18       6.29ns ± 0%    4.65ns ± 0%    -26.07%  (p=0.000 n=16+19)
SubVW/4-18       6.08ns ± 2%    4.84ns ± 0%    -20.43%  (p=0.000 n=20+20)
SubVW/5-18       7.06ns ± 1%    4.93ns ± 0%    -30.18%  (p=0.000 n=20+20)
SubVW/10-18      10.3ns ± 2%    7.2ns ± 0%     -30.35%  (p=0.000 n=20+19)
SubVW/100-18     48.0ns ± 4%    17.6ns ± 0%    -63.32%  (p=0.000 n=18+19)
SubVW/1000-18    448ns ±10%     236ns ± 1%     -47.24%  (p=0.000 n=20+20)
SubVW/10000-18   4.83µs ± 5%    2.96µs ± 0%    -38.73%  (p=0.000 n=20+19)
SubVW/100000-18  46.6µs ± 3%    30.6µs ± 1%    -34.30%  (p=0.000 n=20+20)
[Geo mean]       56.3ns         37.0ns         -34.24%

name             old speed      new speed      delta
SubVW/1-18       1.80GB/s ± 0%  2.10GB/s ± 0%  +16.16%  (p=0.000 n=20+20)
SubVW/2-18       2.97GB/s ± 0%  3.77GB/s ± 0%  +26.95%  (p=0.000 n=20+20)
SubVW/3-18       3.82GB/s ± 0%  5.16GB/s ± 0%  +35.26%  (p=0.000 n=20+19)
SubVW/4-18       5.26GB/s ± 1%  6.61GB/s ± 0%  +25.59%  (p=0.000 n=20+20)
SubVW/5-18       5.67GB/s ± 1%  8.11GB/s ± 0%  +43.12%  (p=0.000 n=20+20)
SubVW/10-18      7.79GB/s ± 2%  11.17GB/s ± 0% +43.52%  (p=0.000 n=20+19)
SubVW/100-18     16.7GB/s ± 4%  45.5GB/s ± 0%  +172.61%  (p=0.000 n=18+20)
SubVW/1000-18    17.9GB/s ± 9%  33.9GB/s ± 1%  +89.25%  (p=0.000 n=20+20)
SubVW/10000-18   16.6GB/s ± 5%  27.0GB/s ± 0%  +63.08%  (p=0.000 n=20+19)
SubVW/100000-18  17.2GB/s ± 2%  26.1GB/s ± 1%  +52.18%  (p=0.000 n=20+20)
[Geo mean]       7.25GB/s       11.03GB/s      +52.01%

Change-Id: I32e99cbab3260054a96231d02b87049c833ab77e
Reviewed-on: https://go-review.googlesource.com/c/go/+/227297
Reviewed-by: Michael Munday <mike.munday@ibm.com>
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2025-05-16 04:44:39 +00:00 · 2020-03-24 11:51:22 -04:00 · 2020-03-24 11:51:22 -04:00 · 3a37fd4010
commit 3a37fd4010
parent 45f1ee3d5f
3 changed files with 73 additions and 210 deletions
--- a/src/math/big/arith_decl_s390x.go
+++ b/src/math/big/arith_decl_s390x.go
@ -12,9 +12,6 @@ func addVV_novec(z, x, y []Word) (c Word)
 func subVV_check(z, x, y []Word) (c Word)
 func subVV_vec(z, x, y []Word) (c Word)
 func subVV_novec(z, x, y []Word) (c Word)
 func subVW_check(z, x []Word, y Word) (c Word)
 func subVW_vec(z, x []Word, y Word) (c Word)
 func subVW_novec(z, x []Word, y Word) (c Word)
 func hasVectorFacility() bool
 var hasVX = hasVectorFacility()
--- a/src/math/big/arith_s390x.s
+++ b/src/math/big/arith_s390x.s
@ -631,220 +631,95 @@ returnC:
 	RET
 TEXT ·subVW(SB), NOSPLIT, $0
-	MOVD subwvectorfacility+0x00(SB), R1
+	MOVD z_len+8(FP), R5
-	BR   (R1)
+	MOVD x+24(FP), R6
 	MOVD y+48(FP), R7    // The borrow bit passed in
 	MOVD z+0(FP), R8
 	MOVD $0, R0          // R0 is a temporary variable used during computation. Ensure it has zero in it.
-TEXT ·subVW_check(SB), NOSPLIT, $0
+	CMPBEQ R5, $0, returnC // len(z) == 0, have an early return
 	MOVB   ·hasVX(SB), R1
 	CMPBEQ R1, $1, vectorimpl               // vectorfacility = 1, vector supported
 	MOVD   $subwvectorfacility+0x00(SB), R1
 	MOVD   $·subVW_novec(SB), R2
 	MOVD   R2, 0(R1)
-	// MOVD	$·subVW_novec(SB), 0(R1)
+	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
-	BR ·subVW_novec(SB)
+	MOVD   0(R6), R9
 	SUBC   R7, R9
 	MOVD   R9, 0(R8)
 	CMPBEQ R5, $1, returnResult
 	MOVD   8(R6), R9
 	SUBE   R0, R9
 	MOVD   R9, 8(R8)
 	CMPBEQ R5, $2, returnResult
-vectorimpl:
+	// Update the counters
-	MOVD $subwvectorfacility+0x00(SB), R1
+	MOVD $16, R12    // i = 2
-	MOVD $·subVW_vec(SB), R2
+	MOVD $-2(R5), R5 // n = n - 2
 	MOVD R2, 0(R1)
-	// MOVD	$·subVW_vec(SB), 0(R1)
+loopOverEachWord:
-	BR ·subVW_vec(SB)
+	BRC  $3, copySetup    // no borrow, copy the rest
 	MOVD 0(R6)(R12*1), R9
-GLOBL subwvectorfacility+0x00(SB), NOPTR, $8
+	// Originally we used the borrow flag generated in the previous iteration
-DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB)
+	// (i.e: SUBE could be used here to do the subtraction). However, since we
 	// already know borrow is 1 (otherwise we will go to copy section), we can
 	// use SUBC here so the current iteration does not depend on the borrow flag
 	// generated in the previous iteration. This could be useful when branch prediction happens.
 	SUBC $1, R9
 	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1
-// func subVW(z, x []Word, y Word) (c Word)
+	MOVD  $8(R12), R12         // i++
-TEXT ·subVW_vec(SB), NOSPLIT, $0
+	BRCTG R5, loopOverEachWord // n--
 	MOVD z_len+8(FP), R3
 	MOVD x+24(FP), R8
 	MOVD y+48(FP), R4    // c = y
 	MOVD z+0(FP), R2
 	MOVD $0, R0  // make sure it's zero
 	MOVD $0, R10 // i = 0
 	MOVD R8, R5
 	MOVD R2, R7
 	// s/JL/JMP/ below to disable the unrolled loop
 	SUB $4, R3  // n -= 4
 	BLT v11     // if n < 0 goto v11
 	SUB $12, R3
 	BLT A11
 	VZERO V0
 	MOVD  $1, R6     // prepare V0 to be final carry register
 	VLVGG $1, R6, V0 // borrow is initially "no borrow"
 	VZERO V9         // to ensure upper half is zero
 	VLVGG $1, R4, V9
 	// n >= 0
 	// regular loop body unrolled 16x
 UU1:
 	VLM  0(R5), V1, V4    // 64-bytes into V1..V4
 	ADD  $64, R5
 	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
 	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
 	VSBCBIQ V1, V9, V0, V25
 	VSBIQ   V1, V9, V0, V17
 	VZERO   V9
 	VSBCBIQ V2, V9, V25, V26
 	VSBIQ   V2, V9, V25, V18
 	VLM 0(R5), V5, V6 // 32-bytes into V5..V6
 	ADD $32, R5
 	VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
 	VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
 	VSBCBIQ V3, V9, V26, V27
 	VSBIQ   V3, V9, V26, V19
 	VSBCBIQ V4, V9, V27, V28
 	VSBIQ   V4, V9, V27, V20
 	VLM 0(R5), V7, V8 // 32-bytes into V7..V8
 	ADD $32, R5
 	VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
 	VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
 	VSBCBIQ V5, V9, V28, V29
 	VSBIQ   V5, V9, V28, V21
 	VSBCBIQ V6, V9, V29, V30
 	VSBIQ   V6, V9, V29, V22
 	VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
 	VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
 	VSBCBIQ V7, V9, V30, V31
 	VSBIQ   V7, V9, V30, V23
 	VSBCBIQ V8, V9, V31, V0  // V0 has carry-over
 	VSBIQ   V8, V9, V31, V24
 	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
 	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
 	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
 	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
 	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
 	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
 	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
 	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
 	VSTM  V17, V24, 0(R7)     // 128-bytes into z
 	ADD   $128, R7
 	ADD   $128, R10           // i += 16
 	SUB   $16, R3             // n -= 16
 	BGE   UU1                 // if n >= 0 goto U1
 	VLGVG $1, V0, R4          // put cf into R4 in case we branch to v10
 	SUB   $1, R4              // save cf
 	NEG   R4, R4
 A11:
 	ADD $12, R3 // n += 16
 	BLT v11 // if n < 0 goto v11
 	// n >= 0
 	// regular loop body unrolled 4x
 U4:  // n >= 0
 	// regular loop body unrolled 4x
 	MOVD 0(R8)(R10*1), R5
 	MOVD 8(R8)(R10*1), R6
 	MOVD 16(R8)(R10*1), R7
 	MOVD 24(R8)(R10*1), R1
 	SUBC R4, R5            // SLGR  -> SUBC
 	SUBE R0, R6            // SLBGR -> SUBE
 	SUBE R0, R7
 	SUBE R0, R1
 	SUBE R4, R4            // save CF
 	NEG  R4, R4
 	MOVD R5, 0(R2)(R10*1)
 	MOVD R6, 8(R2)(R10*1)
 	MOVD R7, 16(R2)(R10*1)
 	MOVD R1, 24(R2)(R10*1)
 	ADD $32, R10 // i += 4 -> i +=32
 	SUB $4, R3   // n -= 4
 	BGE U4       // if n >= 0 goto U4
 v11:
 	ADD $4, R3 // n += 4
 	BLE E11    // if n <= 0 goto E4
 L4:  // n > 0
 	MOVD 0(R8)(R10*1), R5
 	SUBC R4, R5
 	SUBE R4, R4           // save CF
 	NEG  R4, R4
 	MOVD R5, 0(R2)(R10*1)
 	ADD $8, R10 // i++
 	SUB $1, R3  // n--
 	BGT L4      // if n > 0 goto L4
 E11:
 	MOVD R4, c+56(FP) // return c
 // return the current borrow value
 returnResult:
 	SUBE R0, R0
 	NEG  R0, R0
 	MOVD R0, c+56(FP)
 	RET
-// DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
+// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
-// func subVW(z, x []Word, y Word) (c Word)
+// With the assumption that x and z will not overlap with each other or x and z will
-// (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names)
+// point to same memory region, we can use a faster version of copy using only MVC here.
-TEXT ·subVW_novec(SB), NOSPLIT, $0
+// In the following implementation, we have three copy loops, each copying a word, 4 words, and
-	MOVD z_len+8(FP), R3
+// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
-	MOVD x+24(FP), R8
+copySetup:
-	MOVD y+48(FP), R4    // c = y
+	ADD R12, R6
-	MOVD z+0(FP), R2
+	ADD R12, R8
 	MOVD $0, R0          // make sure it's 0
 	MOVD $0, R10         // i = 0
-	// s/JL/JMP/ below to disable the unrolled loop
+	CMPBGE R5, $4, mediumLoop
 	SUB $4, R3 // n -= 4
 	BLT v4     // if n < 4 goto v4
-U4:  // n >= 0
+smallLoop:  // does a loop unrolling to copy word when n < 4
-	// regular loop body unrolled 4x
+	CMPBEQ R5, $0, returnZero
-	MOVD 0(R8)(R10*1), R5
+	MVC    $8, 0(R6), 0(R8)
-	MOVD 8(R8)(R10*1), R6
+	CMPBEQ R5, $1, returnZero
-	MOVD 16(R8)(R10*1), R7
+	MVC    $8, 8(R6), 8(R8)
-	MOVD 24(R8)(R10*1), R1
+	CMPBEQ R5, $2, returnZero
-	SUBC R4, R5            // SLGR  -> SUBC
+	MVC    $8, 16(R6), 16(R8)
 	SUBE R0, R6            // SLBGR -> SUBE
 	SUBE R0, R7
 	SUBE R0, R1
 	SUBE R4, R4            // save CF
 	NEG  R4, R4
 	MOVD R5, 0(R2)(R10*1)
 	MOVD R6, 8(R2)(R10*1)
 	MOVD R7, 16(R2)(R10*1)
 	MOVD R1, 24(R2)(R10*1)
-	ADD $32, R10 // i += 4 -> i +=32
+returnZero:
-	SUB $4, R3   // n -= 4
+	MOVD $0, c+56(FP) // return 0 as borrow
-	BGE U4       // if n >= 0 goto U4
+	RET
-v4:
+mediumLoop:
-	ADD $4, R3 // n += 4
+	CMPBLT R5, $4, smallLoop
-	BLE E4     // if n <= 0 goto E4
+	CMPBLT R5, $32, mediumLoopBody
-L4:  // n > 0
+largeLoop:  // Copying 256 bytes at a time
-	MOVD 0(R8)(R10*1), R5
+	MVC    $256, 0(R6), 0(R8)
-	SUBC R4, R5
+	MOVD   $256(R6), R6
-	SUBE R4, R4           // save CF
+	MOVD   $256(R8), R8
-	NEG  R4, R4
+	MOVD   $-32(R5), R5
-	MOVD R5, 0(R2)(R10*1)
+	CMPBGE R5, $32, largeLoop
 	BR     mediumLoop
-	ADD $8, R10 // i++
+mediumLoopBody:  // Copying 32 bytes at a time
-	SUB $1, R3  // n--
+	MVC    $32, 0(R6), 0(R8)
-	BGT L4      // if n > 0 goto L4
+	MOVD   $32(R6), R6
-
+	MOVD   $32(R8), R8
-E4:
+	MOVD   $-4(R5), R5
-	MOVD R4, c+56(FP) // return c
+	CMPBGE R5, $4, mediumLoopBody
 	BR     smallLoop
 returnC:
 	MOVD R7, c+56(FP)
 	RET
 // func shlVU(z, x []Word, s uint) (c Word)
--- a/src/math/big/arith_s390x_test.go
+++ b/src/math/big/arith_s390x_test.go
@ -30,12 +30,3 @@ func TestFunVVnovec(t *testing.T) {
 		}
 	}
 }
 func TestFunVWnovec(t *testing.T) {
 	if hasVX == true {
 		for _, a := range sumVW {
 			arg := argVW{a.x, a.z, a.y, a.c}
 			testFunVW(t, "subVW_novec", subVW_novec, arg)
 		}
 	}
 }