math/big: rewrite subVW to use fast path on s390x

This CL replaces the original subVW implementation with a implementation
that uses a similar idea as CL 164968.

When we know the borrow bit is zero, we can copy the rest of words as
they will not be updated. Also, since we are copying vector of a words,
a faster implementation of copy is written in this CL to copy a word or
multiple words at a time.

Benchmarks:
name             old time/op    new time/op     delta
SubVW/1-18         4.43ns ± 0%     3.82ns ± 0%   -13.85%  (p=0.000 n=20+20)
SubVW/2-18         5.39ns ± 0%     4.25ns ± 0%   -21.23%  (p=0.000 n=20+20)
SubVW/3-18         6.29ns ± 0%     4.65ns ± 0%   -26.07%  (p=0.000 n=16+19)
SubVW/4-18         6.08ns ± 2%     4.84ns ± 0%   -20.43%  (p=0.000 n=20+20)
SubVW/5-18         7.06ns ± 1%     4.93ns ± 0%   -30.18%  (p=0.000 n=20+20)
SubVW/10-18        10.3ns ± 2%      7.2ns ± 0%   -30.35%  (p=0.000 n=20+19)
SubVW/100-18       48.0ns ± 4%     17.6ns ± 0%   -63.32%  (p=0.000 n=18+19)
SubVW/1000-18       448ns ±10%      236ns ± 1%   -47.24%  (p=0.000 n=20+20)
SubVW/10000-18     4.83µs ± 5%     2.96µs ± 0%   -38.73%  (p=0.000 n=20+19)
SubVW/100000-18    46.6µs ± 3%     30.6µs ± 1%   -34.30%  (p=0.000 n=20+20)
[Geo mean]         56.3ns          37.0ns        -34.24%

name             old speed      new speed       delta
SubVW/1-18       1.80GB/s ± 0%   2.10GB/s ± 0%   +16.16%  (p=0.000 n=20+20)
SubVW/2-18       2.97GB/s ± 0%   3.77GB/s ± 0%   +26.95%  (p=0.000 n=20+20)
SubVW/3-18       3.82GB/s ± 0%   5.16GB/s ± 0%   +35.26%  (p=0.000 n=20+19)
SubVW/4-18       5.26GB/s ± 1%   6.61GB/s ± 0%   +25.59%  (p=0.000 n=20+20)
SubVW/5-18       5.67GB/s ± 1%   8.11GB/s ± 0%   +43.12%  (p=0.000 n=20+20)
SubVW/10-18      7.79GB/s ± 2%  11.17GB/s ± 0%   +43.52%  (p=0.000 n=20+19)
SubVW/100-18     16.7GB/s ± 4%   45.5GB/s ± 0%  +172.61%  (p=0.000 n=18+20)
SubVW/1000-18    17.9GB/s ± 9%   33.9GB/s ± 1%   +89.25%  (p=0.000 n=20+20)
SubVW/10000-18   16.6GB/s ± 5%   27.0GB/s ± 0%   +63.08%  (p=0.000 n=20+19)
SubVW/100000-18  17.2GB/s ± 2%   26.1GB/s ± 1%   +52.18%  (p=0.000 n=20+20)
[Geo mean]       7.25GB/s       11.03GB/s        +52.01%

Change-Id: I32e99cbab3260054a96231d02b87049c833ab77e
Reviewed-on: https://go-review.googlesource.com/c/go/+/227297
Reviewed-by: Michael Munday <mike.munday@ibm.com>
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
This commit is contained in:
Ruixin(Peter) Bao 2020-03-24 11:51:22 -04:00 committed by Michael Munday
parent 45f1ee3d5f
commit 3a37fd4010
3 changed files with 73 additions and 210 deletions

View File

@ -12,9 +12,6 @@ func addVV_novec(z, x, y []Word) (c Word)
func subVV_check(z, x, y []Word) (c Word)
func subVV_vec(z, x, y []Word) (c Word)
func subVV_novec(z, x, y []Word) (c Word)
func subVW_check(z, x []Word, y Word) (c Word)
func subVW_vec(z, x []Word, y Word) (c Word)
func subVW_novec(z, x []Word, y Word) (c Word)
func hasVectorFacility() bool
var hasVX = hasVectorFacility()

View File

@ -631,220 +631,95 @@ returnC:
RET
TEXT ·subVW(SB), NOSPLIT, $0
MOVD subwvectorfacility+0x00(SB), R1
BR (R1)
MOVD z_len+8(FP), R5
MOVD x+24(FP), R6
MOVD y+48(FP), R7 // The borrow bit passed in
MOVD z+0(FP), R8
MOVD $0, R0 // R0 is a temporary variable used during computation. Ensure it has zero in it.
TEXT ·subVW_check(SB), NOSPLIT, $0
MOVB ·hasVX(SB), R1
CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
MOVD $subwvectorfacility+0x00(SB), R1
MOVD $·subVW_novec(SB), R2
MOVD R2, 0(R1)
CMPBEQ R5, $0, returnC // len(z) == 0, have an early return
// MOVD $·subVW_novec(SB), 0(R1)
BR ·subVW_novec(SB)
// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
MOVD 0(R6), R9
SUBC R7, R9
MOVD R9, 0(R8)
CMPBEQ R5, $1, returnResult
MOVD 8(R6), R9
SUBE R0, R9
MOVD R9, 8(R8)
CMPBEQ R5, $2, returnResult
vectorimpl:
MOVD $subwvectorfacility+0x00(SB), R1
MOVD $·subVW_vec(SB), R2
MOVD R2, 0(R1)
// Update the counters
MOVD $16, R12 // i = 2
MOVD $-2(R5), R5 // n = n - 2
// MOVD $·subVW_vec(SB), 0(R1)
BR ·subVW_vec(SB)
loopOverEachWord:
BRC $3, copySetup // no borrow, copy the rest
MOVD 0(R6)(R12*1), R9
GLOBL subwvectorfacility+0x00(SB), NOPTR, $8
DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB)
// Originally we used the borrow flag generated in the previous iteration
// (i.e: SUBE could be used here to do the subtraction). However, since we
// already know borrow is 1 (otherwise we will go to copy section), we can
// use SUBC here so the current iteration does not depend on the borrow flag
// generated in the previous iteration. This could be useful when branch prediction happens.
SUBC $1, R9
MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1
// func subVW(z, x []Word, y Word) (c Word)
TEXT ·subVW_vec(SB), NOSPLIT, $0
MOVD z_len+8(FP), R3
MOVD x+24(FP), R8
MOVD y+48(FP), R4 // c = y
MOVD z+0(FP), R2
MOVD $0, R0 // make sure it's zero
MOVD $0, R10 // i = 0
MOVD R8, R5
MOVD R2, R7
// s/JL/JMP/ below to disable the unrolled loop
SUB $4, R3 // n -= 4
BLT v11 // if n < 0 goto v11
SUB $12, R3
BLT A11
VZERO V0
MOVD $1, R6 // prepare V0 to be final carry register
VLVGG $1, R6, V0 // borrow is initially "no borrow"
VZERO V9 // to ensure upper half is zero
VLVGG $1, R4, V9
// n >= 0
// regular loop body unrolled 16x
UU1:
VLM 0(R5), V1, V4 // 64-bytes into V1..V4
ADD $64, R5
VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
VSBCBIQ V1, V9, V0, V25
VSBIQ V1, V9, V0, V17
VZERO V9
VSBCBIQ V2, V9, V25, V26
VSBIQ V2, V9, V25, V18
VLM 0(R5), V5, V6 // 32-bytes into V5..V6
ADD $32, R5
VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
VSBCBIQ V3, V9, V26, V27
VSBIQ V3, V9, V26, V19
VSBCBIQ V4, V9, V27, V28
VSBIQ V4, V9, V27, V20
VLM 0(R5), V7, V8 // 32-bytes into V7..V8
ADD $32, R5
VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
VSBCBIQ V5, V9, V28, V29
VSBIQ V5, V9, V28, V21
VSBCBIQ V6, V9, V29, V30
VSBIQ V6, V9, V29, V22
VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
VSBCBIQ V7, V9, V30, V31
VSBIQ V7, V9, V30, V23
VSBCBIQ V8, V9, V31, V0 // V0 has carry-over
VSBIQ V8, V9, V31, V24
VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
VSTM V17, V24, 0(R7) // 128-bytes into z
ADD $128, R7
ADD $128, R10 // i += 16
SUB $16, R3 // n -= 16
BGE UU1 // if n >= 0 goto U1
VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10
SUB $1, R4 // save cf
NEG R4, R4
A11:
ADD $12, R3 // n += 16
BLT v11 // if n < 0 goto v11
// n >= 0
// regular loop body unrolled 4x
U4: // n >= 0
// regular loop body unrolled 4x
MOVD 0(R8)(R10*1), R5
MOVD 8(R8)(R10*1), R6
MOVD 16(R8)(R10*1), R7
MOVD 24(R8)(R10*1), R1
SUBC R4, R5 // SLGR -> SUBC
SUBE R0, R6 // SLBGR -> SUBE
SUBE R0, R7
SUBE R0, R1
SUBE R4, R4 // save CF
NEG R4, R4
MOVD R5, 0(R2)(R10*1)
MOVD R6, 8(R2)(R10*1)
MOVD R7, 16(R2)(R10*1)
MOVD R1, 24(R2)(R10*1)
ADD $32, R10 // i += 4 -> i +=32
SUB $4, R3 // n -= 4
BGE U4 // if n >= 0 goto U4
v11:
ADD $4, R3 // n += 4
BLE E11 // if n <= 0 goto E4
L4: // n > 0
MOVD 0(R8)(R10*1), R5
SUBC R4, R5
SUBE R4, R4 // save CF
NEG R4, R4
MOVD R5, 0(R2)(R10*1)
ADD $8, R10 // i++
SUB $1, R3 // n--
BGT L4 // if n > 0 goto L4
E11:
MOVD R4, c+56(FP) // return c
MOVD $8(R12), R12 // i++
BRCTG R5, loopOverEachWord // n--
// return the current borrow value
returnResult:
SUBE R0, R0
NEG R0, R0
MOVD R0, c+56(FP)
RET
// DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
// func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names)
TEXT ·subVW_novec(SB), NOSPLIT, $0
MOVD z_len+8(FP), R3
MOVD x+24(FP), R8
MOVD y+48(FP), R4 // c = y
MOVD z+0(FP), R2
MOVD $0, R0 // make sure it's 0
MOVD $0, R10 // i = 0
// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
// With the assumption that x and z will not overlap with each other or x and z will
// point to same memory region, we can use a faster version of copy using only MVC here.
// In the following implementation, we have three copy loops, each copying a word, 4 words, and
// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
ADD R12, R6
ADD R12, R8
// s/JL/JMP/ below to disable the unrolled loop
SUB $4, R3 // n -= 4
BLT v4 // if n < 4 goto v4
CMPBGE R5, $4, mediumLoop
U4: // n >= 0
// regular loop body unrolled 4x
MOVD 0(R8)(R10*1), R5
MOVD 8(R8)(R10*1), R6
MOVD 16(R8)(R10*1), R7
MOVD 24(R8)(R10*1), R1
SUBC R4, R5 // SLGR -> SUBC
SUBE R0, R6 // SLBGR -> SUBE
SUBE R0, R7
SUBE R0, R1
SUBE R4, R4 // save CF
NEG R4, R4
MOVD R5, 0(R2)(R10*1)
MOVD R6, 8(R2)(R10*1)
MOVD R7, 16(R2)(R10*1)
MOVD R1, 24(R2)(R10*1)
smallLoop: // does a loop unrolling to copy word when n < 4
CMPBEQ R5, $0, returnZero
MVC $8, 0(R6), 0(R8)
CMPBEQ R5, $1, returnZero
MVC $8, 8(R6), 8(R8)
CMPBEQ R5, $2, returnZero
MVC $8, 16(R6), 16(R8)
ADD $32, R10 // i += 4 -> i +=32
SUB $4, R3 // n -= 4
BGE U4 // if n >= 0 goto U4
returnZero:
MOVD $0, c+56(FP) // return 0 as borrow
RET
v4:
ADD $4, R3 // n += 4
BLE E4 // if n <= 0 goto E4
mediumLoop:
CMPBLT R5, $4, smallLoop
CMPBLT R5, $32, mediumLoopBody
L4: // n > 0
MOVD 0(R8)(R10*1), R5
SUBC R4, R5
SUBE R4, R4 // save CF
NEG R4, R4
MOVD R5, 0(R2)(R10*1)
largeLoop: // Copying 256 bytes at a time
MVC $256, 0(R6), 0(R8)
MOVD $256(R6), R6
MOVD $256(R8), R8
MOVD $-32(R5), R5
CMPBGE R5, $32, largeLoop
BR mediumLoop
ADD $8, R10 // i++
SUB $1, R3 // n--
BGT L4 // if n > 0 goto L4
E4:
MOVD R4, c+56(FP) // return c
mediumLoopBody: // Copying 32 bytes at a time
MVC $32, 0(R6), 0(R8)
MOVD $32(R6), R6
MOVD $32(R8), R8
MOVD $-4(R5), R5
CMPBGE R5, $4, mediumLoopBody
BR smallLoop
returnC:
MOVD R7, c+56(FP)
RET
// func shlVU(z, x []Word, s uint) (c Word)

View File

@ -30,12 +30,3 @@ func TestFunVVnovec(t *testing.T) {
}
}
}
func TestFunVWnovec(t *testing.T) {
if hasVX == true {
for _, a := range sumVW {
arg := argVW{a.x, a.z, a.y, a.c}
testFunVW(t, "subVW_novec", subVW_novec, arg)
}
}
}