mirror of
https://github.com/golang/go.git
synced 2025-05-05 15:43:04 +00:00
internal/bytealg: optimize Count/CountString on arm64
Introduce ABIInternal support for Count/CountString Move <32 size block from function end to beginning as fastpath goos: linux goarch: arm64 pkg: strings │ base.txt │ new.txt │ │ B/s │ B/s vs base │ CountByte/10 672.5Mi ± 0% 692.9Mi ± 0% +3.04% (p=0.000 n=10) CountByte/32 3.592Gi ± 0% 3.970Gi ± 0% +10.53% (p=0.000 n=10) CountByte/4096 16.63Gi ± 0% 16.73Gi ± 0% +0.64% (p=0.000 n=10) CountByte/4194304 14.97Gi ± 2% 15.02Gi ± 1% ~ (p=0.190 n=10) CountByte/67108864 12.50Gi ± 0% 12.50Gi ± 0% ~ (p=0.853 n=10) geomean 5.931Gi 6.099Gi +2.83% Change-Id: I5af1be2b117d9fb8d570739637499923de62251c Reviewed-on: https://go-review.googlesource.com/c/go/+/662395 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@google.com> Commit-Queue: Keith Randall <khr@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
9326d9d012
commit
f2e9076764
@ -5,33 +5,45 @@
|
||||
#include "go_asm.h"
|
||||
#include "textflag.h"
|
||||
|
||||
TEXT ·Count(SB),NOSPLIT,$0-40
|
||||
MOVD b_base+0(FP), R0
|
||||
MOVD b_len+8(FP), R2
|
||||
MOVBU c+24(FP), R1
|
||||
MOVD $ret+32(FP), R8
|
||||
B countbytebody<>(SB)
|
||||
|
||||
TEXT ·CountString(SB),NOSPLIT,$0-32
|
||||
MOVD s_base+0(FP), R0
|
||||
MOVD s_len+8(FP), R2
|
||||
MOVBU c+16(FP), R1
|
||||
MOVD $ret+24(FP), R8
|
||||
B countbytebody<>(SB)
|
||||
|
||||
// func Count(b []byte, c byte) int
|
||||
// input:
|
||||
// R0: data
|
||||
// R2: data len
|
||||
// R1: byte to find
|
||||
// R8: address to put result
|
||||
TEXT countbytebody<>(SB),NOSPLIT,$0
|
||||
// R0: b ptr
|
||||
// R1: b len
|
||||
// R2: b cap
|
||||
// R3: c byte to search
|
||||
// return:
|
||||
// R0: result
|
||||
TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
|
||||
MOVD R3, R2
|
||||
B ·CountString<ABIInternal>(SB)
|
||||
|
||||
// func CountString(s string, c byte) int
|
||||
// input:
|
||||
// R0: s ptr
|
||||
// R1: s len
|
||||
// R2: c byte to search (due to ABIInternal upper bits can contain junk)
|
||||
// return:
|
||||
// R0: result
|
||||
TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
|
||||
// R11 = count of byte to search
|
||||
MOVD $0, R11
|
||||
// short path to handle 0-byte case
|
||||
CBZ R2, done
|
||||
CMP $0x20, R2
|
||||
// jump directly to tail if length < 32
|
||||
BLO tail
|
||||
CBZ R1, done
|
||||
CMP $0x20, R1
|
||||
// jump directly to head if length >= 32
|
||||
BHS head
|
||||
tail:
|
||||
// Work with tail shorter than 32 bytes
|
||||
MOVBU.P 1(R0), R5
|
||||
SUB $1, R1, R1
|
||||
CMP R2.UXTB, R5
|
||||
CINC EQ, R11, R11
|
||||
CBNZ R1, tail
|
||||
done:
|
||||
MOVD R11, R0
|
||||
RET
|
||||
PCALIGN $16
|
||||
head:
|
||||
ANDS $0x1f, R0, R9
|
||||
BEQ chunk
|
||||
// Work with not 32-byte aligned head
|
||||
@ -40,24 +52,23 @@ TEXT countbytebody<>(SB),NOSPLIT,$0
|
||||
PCALIGN $16
|
||||
head_loop:
|
||||
MOVBU.P 1(R0), R5
|
||||
CMP R5, R1
|
||||
CMP R2.UXTB, R5
|
||||
CINC EQ, R11, R11
|
||||
SUB $1, R2, R2
|
||||
SUB $1, R1, R1
|
||||
CMP R0, R3
|
||||
BNE head_loop
|
||||
// Work with 32-byte aligned chunks
|
||||
chunk:
|
||||
BIC $0x1f, R2, R9
|
||||
BIC $0x1f, R1, R9
|
||||
// The first chunk can also be the last
|
||||
CBZ R9, tail
|
||||
// R3 = end of 32-byte chunks
|
||||
ADD R0, R9, R3
|
||||
MOVD $1, R5
|
||||
VMOV R5, V5.B16
|
||||
// R2 = length of tail
|
||||
SUB R9, R2, R2
|
||||
// Duplicate R1 (byte to search) to 16 1-byte elements of V0
|
||||
VMOV R1, V0.B16
|
||||
// R1 = length of tail
|
||||
SUB R9, R1, R1
|
||||
// Duplicate R2 (byte to search) to 16 1-byte elements of V0
|
||||
VMOV R2, V0.B16
|
||||
// Clear the low 64-bit element of V7 and V8
|
||||
VEOR V7.B8, V7.B8, V7.B8
|
||||
VEOR V8.B8, V8.B8, V8.B8
|
||||
@ -79,14 +90,5 @@ chunk_loop:
|
||||
BNE chunk_loop
|
||||
VMOV V8.D[0], R6
|
||||
ADD R6, R11, R11
|
||||
CBZ R2, done
|
||||
tail:
|
||||
// Work with tail shorter than 32 bytes
|
||||
MOVBU.P 1(R0), R5
|
||||
SUB $1, R2, R2
|
||||
CMP R5, R1
|
||||
CINC EQ, R11, R11
|
||||
CBNZ R2, tail
|
||||
done:
|
||||
MOVD R11, (R8)
|
||||
RET
|
||||
CBZ R1, done
|
||||
B tail
|
||||
|
Loading…
x
Reference in New Issue
Block a user