mirror of
https://github.com/golang/go.git
synced 2025-05-05 23:53:05 +00:00
internal/bytealg: optimize Count/CountString on arm64
Introduce ABIInternal support for Count/CountString Move <32 size block from function end to beginning as fastpath goos: linux goarch: arm64 pkg: strings │ base.txt │ new.txt │ │ B/s │ B/s vs base │ CountByte/10 672.5Mi ± 0% 692.9Mi ± 0% +3.04% (p=0.000 n=10) CountByte/32 3.592Gi ± 0% 3.970Gi ± 0% +10.53% (p=0.000 n=10) CountByte/4096 16.63Gi ± 0% 16.73Gi ± 0% +0.64% (p=0.000 n=10) CountByte/4194304 14.97Gi ± 2% 15.02Gi ± 1% ~ (p=0.190 n=10) CountByte/67108864 12.50Gi ± 0% 12.50Gi ± 0% ~ (p=0.853 n=10) geomean 5.931Gi 6.099Gi +2.83% Change-Id: I5af1be2b117d9fb8d570739637499923de62251c Reviewed-on: https://go-review.googlesource.com/c/go/+/662395 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@google.com> Commit-Queue: Keith Randall <khr@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
9326d9d012
commit
f2e9076764
@ -5,33 +5,45 @@
|
|||||||
#include "go_asm.h"
|
#include "go_asm.h"
|
||||||
#include "textflag.h"
|
#include "textflag.h"
|
||||||
|
|
||||||
TEXT ·Count(SB),NOSPLIT,$0-40
|
// func Count(b []byte, c byte) int
|
||||||
MOVD b_base+0(FP), R0
|
|
||||||
MOVD b_len+8(FP), R2
|
|
||||||
MOVBU c+24(FP), R1
|
|
||||||
MOVD $ret+32(FP), R8
|
|
||||||
B countbytebody<>(SB)
|
|
||||||
|
|
||||||
TEXT ·CountString(SB),NOSPLIT,$0-32
|
|
||||||
MOVD s_base+0(FP), R0
|
|
||||||
MOVD s_len+8(FP), R2
|
|
||||||
MOVBU c+16(FP), R1
|
|
||||||
MOVD $ret+24(FP), R8
|
|
||||||
B countbytebody<>(SB)
|
|
||||||
|
|
||||||
// input:
|
// input:
|
||||||
// R0: data
|
// R0: b ptr
|
||||||
// R2: data len
|
// R1: b len
|
||||||
// R1: byte to find
|
// R2: b cap
|
||||||
// R8: address to put result
|
// R3: c byte to search
|
||||||
TEXT countbytebody<>(SB),NOSPLIT,$0
|
// return:
|
||||||
|
// R0: result
|
||||||
|
TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
|
||||||
|
MOVD R3, R2
|
||||||
|
B ·CountString<ABIInternal>(SB)
|
||||||
|
|
||||||
|
// func CountString(s string, c byte) int
|
||||||
|
// input:
|
||||||
|
// R0: s ptr
|
||||||
|
// R1: s len
|
||||||
|
// R2: c byte to search (due to ABIInternal upper bits can contain junk)
|
||||||
|
// return:
|
||||||
|
// R0: result
|
||||||
|
TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
|
||||||
// R11 = count of byte to search
|
// R11 = count of byte to search
|
||||||
MOVD $0, R11
|
MOVD $0, R11
|
||||||
// short path to handle 0-byte case
|
// short path to handle 0-byte case
|
||||||
CBZ R2, done
|
CBZ R1, done
|
||||||
CMP $0x20, R2
|
CMP $0x20, R1
|
||||||
// jump directly to tail if length < 32
|
// jump directly to head if length >= 32
|
||||||
BLO tail
|
BHS head
|
||||||
|
tail:
|
||||||
|
// Work with tail shorter than 32 bytes
|
||||||
|
MOVBU.P 1(R0), R5
|
||||||
|
SUB $1, R1, R1
|
||||||
|
CMP R2.UXTB, R5
|
||||||
|
CINC EQ, R11, R11
|
||||||
|
CBNZ R1, tail
|
||||||
|
done:
|
||||||
|
MOVD R11, R0
|
||||||
|
RET
|
||||||
|
PCALIGN $16
|
||||||
|
head:
|
||||||
ANDS $0x1f, R0, R9
|
ANDS $0x1f, R0, R9
|
||||||
BEQ chunk
|
BEQ chunk
|
||||||
// Work with not 32-byte aligned head
|
// Work with not 32-byte aligned head
|
||||||
@ -40,24 +52,23 @@ TEXT countbytebody<>(SB),NOSPLIT,$0
|
|||||||
PCALIGN $16
|
PCALIGN $16
|
||||||
head_loop:
|
head_loop:
|
||||||
MOVBU.P 1(R0), R5
|
MOVBU.P 1(R0), R5
|
||||||
CMP R5, R1
|
CMP R2.UXTB, R5
|
||||||
CINC EQ, R11, R11
|
CINC EQ, R11, R11
|
||||||
SUB $1, R2, R2
|
SUB $1, R1, R1
|
||||||
CMP R0, R3
|
CMP R0, R3
|
||||||
BNE head_loop
|
BNE head_loop
|
||||||
// Work with 32-byte aligned chunks
|
|
||||||
chunk:
|
chunk:
|
||||||
BIC $0x1f, R2, R9
|
BIC $0x1f, R1, R9
|
||||||
// The first chunk can also be the last
|
// The first chunk can also be the last
|
||||||
CBZ R9, tail
|
CBZ R9, tail
|
||||||
// R3 = end of 32-byte chunks
|
// R3 = end of 32-byte chunks
|
||||||
ADD R0, R9, R3
|
ADD R0, R9, R3
|
||||||
MOVD $1, R5
|
MOVD $1, R5
|
||||||
VMOV R5, V5.B16
|
VMOV R5, V5.B16
|
||||||
// R2 = length of tail
|
// R1 = length of tail
|
||||||
SUB R9, R2, R2
|
SUB R9, R1, R1
|
||||||
// Duplicate R1 (byte to search) to 16 1-byte elements of V0
|
// Duplicate R2 (byte to search) to 16 1-byte elements of V0
|
||||||
VMOV R1, V0.B16
|
VMOV R2, V0.B16
|
||||||
// Clear the low 64-bit element of V7 and V8
|
// Clear the low 64-bit element of V7 and V8
|
||||||
VEOR V7.B8, V7.B8, V7.B8
|
VEOR V7.B8, V7.B8, V7.B8
|
||||||
VEOR V8.B8, V8.B8, V8.B8
|
VEOR V8.B8, V8.B8, V8.B8
|
||||||
@ -79,14 +90,5 @@ chunk_loop:
|
|||||||
BNE chunk_loop
|
BNE chunk_loop
|
||||||
VMOV V8.D[0], R6
|
VMOV V8.D[0], R6
|
||||||
ADD R6, R11, R11
|
ADD R6, R11, R11
|
||||||
CBZ R2, done
|
CBZ R1, done
|
||||||
tail:
|
B tail
|
||||||
// Work with tail shorter than 32 bytes
|
|
||||||
MOVBU.P 1(R0), R5
|
|
||||||
SUB $1, R2, R2
|
|
||||||
CMP R5, R1
|
|
||||||
CINC EQ, R11, R11
|
|
||||||
CBNZ R2, tail
|
|
||||||
done:
|
|
||||||
MOVD R11, (R8)
|
|
||||||
RET
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user