mirror of
https://github.com/golang/go.git
synced 2025-05-05 15:43:04 +00:00
internal/bytealg: optimize Index/IndexString/IndexByte/IndexByteString on arm64
Introduce ABIInternal support for Index/IndexString/IndexByte/IndexByteString goos: linux goarch: arm64 pkg: bytes │ base.txt │ new.txt │ │ B/s │ B/s vs base │ IndexByte/10 1.090Gi ± 0% 1.313Gi ± 0% +20.51% (p=0.000 n=10) IndexByte/32 3.714Gi ± 0% 4.289Gi ± 0% +15.47% (p=0.000 n=10) IndexByte/4K 22.92Gi ± 0% 23.01Gi ± 0% +0.37% (p=0.000 n=10) IndexByte/4M 20.23Gi ± 0% 20.35Gi ± 0% +0.60% (p=0.000 n=10) IndexByte/64M 23.82Gi ± 0% 23.81Gi ± 0% -0.01% (p=0.002 n=10) IndexBytePortable/10 788.5Mi ± 0% 788.5Mi ± 0% ~ (p=0.722 n=10) IndexBytePortable/32 1002.3Mi ± 0% 1002.3Mi ± 0% ~ (p=0.137 n=10) IndexBytePortable/4K 1.111Gi ± 0% 1.111Gi ± 0% ~ (p=0.692 n=10) IndexBytePortable/4M 1.116Gi ± 0% 1.116Gi ± 0% ~ (p=0.158 n=10) IndexBytePortable/64M 1.116Gi ± 0% 1.116Gi ± 0% -0.01% (p=0.000 n=10) IndexRune/10 352.1Mi ± 0% 445.0Mi ± 0% +26.38% (p=0.000 n=10) IndexRune/32 1.101Gi ± 0% 1.391Gi ± 0% +26.43% (p=0.000 n=10) IndexRune/4K 21.07Gi ± 0% 21.25Gi ± 0% +0.82% (p=0.000 n=10) IndexRune/4M 23.81Gi ± 0% 23.81Gi ± 0% ~ (p=0.218 n=10) IndexRune/64M 23.81Gi ± 0% 23.81Gi ± 0% ~ (p=0.271 n=10) IndexRuneASCII/10 1.038Gi ± 0% 1.190Gi ± 1% +14.63% (p=0.000 n=10) IndexRuneASCII/32 3.643Gi ± 2% 4.203Gi ± 0% +15.38% (p=0.000 n=10) IndexRuneASCII/4K 22.90Gi ± 0% 22.98Gi ± 0% +0.34% (p=0.000 n=10) IndexRuneASCII/4M 23.81Gi ± 0% 23.81Gi ± 0% ~ (p=0.108 n=10) IndexRuneASCII/64M 23.82Gi ± 0% 23.81Gi ± 0% ~ (p=0.105 n=10) IndexRuneUnicode/Latin/10 404.4Mi ± 0% 493.7Mi ± 0% +22.10% (p=0.000 n=10) IndexRuneUnicode/Latin/32 1.261Gi ± 0% 1.543Gi ± 0% +22.31% (p=0.000 n=10) IndexRuneUnicode/Latin/4K 6.966Gi ± 0% 8.115Gi ± 0% +16.50% (p=0.000 n=10) IndexRuneUnicode/Latin/4M 6.599Gi ± 0% 7.576Gi ± 0% +14.80% (p=0.000 n=10) IndexRuneUnicode/Latin/64M 6.297Gi ± 0% 7.070Gi ± 2% +12.28% (p=0.000 n=10) IndexRuneUnicode/Cyrillic/10 385.9Mi ± 0% 440.1Mi ± 0% +14.03% (p=0.000 n=10) IndexRuneUnicode/Cyrillic/32 1.206Gi ± 0% 1.375Gi ± 0% +14.05% (p=0.000 n=10) IndexRuneUnicode/Cyrillic/4K 2.468Gi ± 0% 2.921Gi ± 0% +18.37% (p=0.000 n=10) IndexRuneUnicode/Cyrillic/4M 2.386Gi ± 0% 2.845Gi ± 0% +19.23% (p=0.000 n=10) IndexRuneUnicode/Cyrillic/64M 2.280Gi ± 0% 2.717Gi ± 0% +19.14% (p=0.000 n=10) IndexRuneUnicode/Han/10 307.1Mi ± 0% 331.5Mi ± 0% +7.94% (p=0.000 n=10) IndexRuneUnicode/Han/32 982.2Mi ± 0% 1060.2Mi ± 0% +7.94% (p=0.000 n=10) IndexRuneUnicode/Han/4K 4.986Gi ± 0% 5.957Gi ± 0% +19.48% (p=0.000 n=10) IndexRuneUnicode/Han/4M 3.822Gi ± 0% 4.198Gi ± 0% +9.83% (p=0.000 n=10) IndexRuneUnicode/Han/64M 3.765Gi ± 0% 4.140Gi ± 0% +9.96% (p=0.000 n=10) Index/10 634.6Mi ± 0% 635.2Mi ± 0% +0.09% (p=0.000 n=10) Index/32 375.3Mi ± 0% 385.1Mi ± 0% +2.63% (p=0.000 n=10) Index/4K 754.8Mi ± 0% 755.2Mi ± 0% +0.04% (p=0.001 n=10) Index/4M 746.5Mi ± 0% 746.3Mi ± 0% -0.03% (p=0.000 n=10) Index/64M 746.5Mi ± 0% 746.3Mi ± 0% -0.03% (p=0.000 n=10) IndexEasy/10 714.6Mi ± 0% 714.6Mi ± 0% +0.00% (p=0.001 n=10) IndexEasy/32 1.221Gi ± 0% 1.524Gi ± 0% +24.81% (p=0.000 n=10) IndexEasy/4K 21.06Gi ± 0% 21.47Gi ± 0% +1.91% (p=0.000 n=10) IndexEasy/4M 20.23Gi ± 0% 20.24Gi ± 0% ~ (p=0.684 n=10) IndexEasy/64M 13.07Gi ± 0% 12.58Gi ± 4% -3.75% (p=0.000 n=10) IndexHard1 1.114Gi ± 0% 1.114Gi ± 0% ~ (p=0.193 n=10) IndexHard2 1.111Gi ± 0% 1.112Gi ± 0% +0.04% (p=0.001 n=10) IndexHard3 1.086Gi ± 0% 1.081Gi ± 0% -0.37% (p=0.000 n=10) IndexHard4 607.9Mi ± 0% 607.9Mi ± 0% ~ (p=0.136 n=10) geomean 2.536Gi 2.720Gi +7.26% Change-Id: I1fc246783ebb215882d7144d05dbe2433dc66751 Reviewed-on: https://go-review.googlesource.com/c/go/+/662415 Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Keith Randall <khr@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Keith Randall <khr@golang.org>
This commit is contained in:
parent
f2e9076764
commit
5b36f61356
@ -5,29 +5,30 @@
|
||||
#include "go_asm.h"
|
||||
#include "textflag.h"
|
||||
|
||||
TEXT ·Index(SB),NOSPLIT,$0-56
|
||||
MOVD a_base+0(FP), R0
|
||||
MOVD a_len+8(FP), R1
|
||||
MOVD b_base+24(FP), R2
|
||||
MOVD b_len+32(FP), R3
|
||||
MOVD $ret+48(FP), R9
|
||||
B indexbody<>(SB)
|
||||
|
||||
TEXT ·IndexString(SB),NOSPLIT,$0-40
|
||||
MOVD a_base+0(FP), R0
|
||||
MOVD a_len+8(FP), R1
|
||||
MOVD b_base+16(FP), R2
|
||||
MOVD b_len+24(FP), R3
|
||||
MOVD $ret+32(FP), R9
|
||||
B indexbody<>(SB)
|
||||
|
||||
// func Index(a, b []byte) int
|
||||
// input:
|
||||
// R0: haystack
|
||||
// R1: length of haystack
|
||||
// R2: needle
|
||||
// R3: length of needle (2 <= len <= 32)
|
||||
// R9: address to put result
|
||||
TEXT indexbody<>(SB),NOSPLIT,$0-56
|
||||
// R0: a ptr (haystack)
|
||||
// R1: a len (haystack)
|
||||
// R2: a cap (haystack) (unused)
|
||||
// R3: b ptr (needle)
|
||||
// R4: b len (needle) (2 <= len <= 32)
|
||||
// R5: b cap (needle) (unused)
|
||||
// return:
|
||||
// R0: result
|
||||
TEXT ·Index<ABIInternal>(SB),NOSPLIT,$0-56
|
||||
MOVD R3, R2
|
||||
MOVD R4, R3
|
||||
B ·IndexString<ABIInternal>(SB)
|
||||
|
||||
// func IndexString(a, b string) int
|
||||
// input:
|
||||
// R0: a ptr (haystack)
|
||||
// R1: a len (haystack)
|
||||
// R2: b ptr (needle)
|
||||
// R3: b len (needle) (2 <= len <= 32)
|
||||
// return:
|
||||
// R0: result
|
||||
TEXT ·IndexString<ABIInternal>(SB),NOSPLIT,$0-40
|
||||
// main idea is to load 'sep' into separate register(s)
|
||||
// to avoid repeatedly re-load it again and again
|
||||
// for sebsequent substring comparisons
|
||||
@ -136,11 +137,9 @@ loop_2:
|
||||
BNE loop_2
|
||||
found:
|
||||
SUB R8, R0, R0
|
||||
MOVD R0, (R9)
|
||||
RET
|
||||
not_found:
|
||||
MOVD $-1, R0
|
||||
MOVD R0, (R9)
|
||||
RET
|
||||
greater_8:
|
||||
SUB $9, R3, R11 // len(sep) - 9, offset of R0 for last 8 bytes
|
||||
|
@ -4,26 +4,26 @@
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
TEXT ·IndexByte(SB),NOSPLIT,$0-40
|
||||
MOVD b_base+0(FP), R0
|
||||
MOVD b_len+8(FP), R2
|
||||
MOVBU c+24(FP), R1
|
||||
MOVD $ret+32(FP), R8
|
||||
B indexbytebody<>(SB)
|
||||
|
||||
TEXT ·IndexByteString(SB),NOSPLIT,$0-32
|
||||
MOVD s_base+0(FP), R0
|
||||
MOVD s_len+8(FP), R2
|
||||
MOVBU c+16(FP), R1
|
||||
MOVD $ret+24(FP), R8
|
||||
B indexbytebody<>(SB)
|
||||
|
||||
// func IndexByte(b []byte, c byte) int
|
||||
// input:
|
||||
// R0: data
|
||||
// R1: byte to search
|
||||
// R2: data len
|
||||
// R8: address to put result
|
||||
TEXT indexbytebody<>(SB),NOSPLIT,$0
|
||||
// R0: b ptr
|
||||
// R1: b len
|
||||
// R2: b cap (unused)
|
||||
// R3: c byte to search
|
||||
// return
|
||||
// R0: result
|
||||
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
|
||||
MOVD R3, R2
|
||||
B ·IndexByteString<ABIInternal>(SB)
|
||||
|
||||
// func IndexByteString(s string, c byte) int
|
||||
// input:
|
||||
// R0: s ptr
|
||||
// R1: s len
|
||||
// R2: c byte to search
|
||||
// return
|
||||
// R0: result
|
||||
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
|
||||
// Core algorithm:
|
||||
// For each 32-byte chunk we calculate a 64-bit syndrome value,
|
||||
// with two bits per byte. For each tuple, bit 0 is set if the
|
||||
@ -33,19 +33,19 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0
|
||||
// in the original string, counting trailing zeros allows to
|
||||
// identify exactly which byte has matched.
|
||||
|
||||
CBZ R2, fail
|
||||
CBZ R1, fail
|
||||
MOVD R0, R11
|
||||
// Magic constant 0x40100401 allows us to identify
|
||||
// which lane matches the requested byte.
|
||||
// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
|
||||
// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
|
||||
MOVD $0x40100401, R5
|
||||
VMOV R1, V0.B16
|
||||
VMOV R2, V0.B16
|
||||
// Work with aligned 32-byte chunks
|
||||
BIC $0x1f, R0, R3
|
||||
VMOV R5, V5.S4
|
||||
ANDS $0x1f, R0, R9
|
||||
AND $0x1f, R2, R10
|
||||
AND $0x1f, R1, R10
|
||||
BEQ loop
|
||||
|
||||
// Input string is not 32-byte aligned. We calculate the
|
||||
@ -53,7 +53,7 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0
|
||||
// the first bytes and mask off the irrelevant part.
|
||||
VLD1.P (R3), [V1.B16, V2.B16]
|
||||
SUB $0x20, R9, R4
|
||||
ADDS R4, R2, R2
|
||||
ADDS R4, R1, R1
|
||||
VCMEQ V0.B16, V1.B16, V3.B16
|
||||
VCMEQ V0.B16, V2.B16, V4.B16
|
||||
VAND V5.B16, V3.B16, V3.B16
|
||||
@ -72,7 +72,7 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0
|
||||
|
||||
loop:
|
||||
VLD1.P (R3), [V1.B16, V2.B16]
|
||||
SUBS $0x20, R2, R2
|
||||
SUBS $0x20, R1, R1
|
||||
VCMEQ V0.B16, V1.B16, V3.B16
|
||||
VCMEQ V0.B16, V2.B16, V4.B16
|
||||
// If we're out of data we finish regardless of the result
|
||||
@ -117,10 +117,8 @@ tail:
|
||||
ADD R6>>1, R3, R0
|
||||
// Compute the offset result
|
||||
SUB R11, R0, R0
|
||||
MOVD R0, (R8)
|
||||
RET
|
||||
|
||||
fail:
|
||||
MOVD $-1, R0
|
||||
MOVD R0, (R8)
|
||||
RET
|
||||
|
Loading…
x
Reference in New Issue
Block a user