internal/bytealg: optimize Count/CountString on arm64

Introduce ABIInternal support for Count/CountString
Move <32 size block from function end to beginning as fastpath

goos: linux
goarch: arm64
pkg: strings
                   │   base.txt   │               new.txt                │
                   │     B/s      │     B/s       vs base                │
CountByte/10         672.5Mi ± 0%   692.9Mi ± 0%   +3.04% (p=0.000 n=10)
CountByte/32         3.592Gi ± 0%   3.970Gi ± 0%  +10.53% (p=0.000 n=10)
CountByte/4096       16.63Gi ± 0%   16.73Gi ± 0%   +0.64% (p=0.000 n=10)
CountByte/4194304    14.97Gi ± 2%   15.02Gi ± 1%        ~ (p=0.190 n=10)
CountByte/67108864   12.50Gi ± 0%   12.50Gi ± 0%        ~ (p=0.853 n=10)
geomean              5.931Gi        6.099Gi        +2.83%

Change-Id: I5af1be2b117d9fb8d570739637499923de62251c
Reviewed-on: https://go-review.googlesource.com/c/go/+/662395
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Commit-Queue: Keith Randall <khr@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
Vasily Leonenko 2025-03-31 23:25:00 +03:00 committed by Gopher Robot
parent 9326d9d012
commit f2e9076764

View File

@ -5,33 +5,45 @@
#include "go_asm.h"
#include "textflag.h"
TEXT ·Count(SB),NOSPLIT,$0-40
MOVD b_base+0(FP), R0
MOVD b_len+8(FP), R2
MOVBU c+24(FP), R1
MOVD $ret+32(FP), R8
B countbytebody<>(SB)
TEXT ·CountString(SB),NOSPLIT,$0-32
MOVD s_base+0(FP), R0
MOVD s_len+8(FP), R2
MOVBU c+16(FP), R1
MOVD $ret+24(FP), R8
B countbytebody<>(SB)
// func Count(b []byte, c byte) int
// input:
// R0: data
// R2: data len
// R1: byte to find
// R8: address to put result
TEXT countbytebody<>(SB),NOSPLIT,$0
// R0: b ptr
// R1: b len
// R2: b cap
// R3: c byte to search
// return:
// R0: result
TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
MOVD R3, R2
B ·CountString<ABIInternal>(SB)
// func CountString(s string, c byte) int
// input:
// R0: s ptr
// R1: s len
// R2: c byte to search (due to ABIInternal upper bits can contain junk)
// return:
// R0: result
TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
// R11 = count of byte to search
MOVD $0, R11
// short path to handle 0-byte case
CBZ R2, done
CMP $0x20, R2
// jump directly to tail if length < 32
BLO tail
CBZ R1, done
CMP $0x20, R1
// jump directly to head if length >= 32
BHS head
tail:
// Work with tail shorter than 32 bytes
MOVBU.P 1(R0), R5
SUB $1, R1, R1
CMP R2.UXTB, R5
CINC EQ, R11, R11
CBNZ R1, tail
done:
MOVD R11, R0
RET
PCALIGN $16
head:
ANDS $0x1f, R0, R9
BEQ chunk
// Work with not 32-byte aligned head
@ -40,24 +52,23 @@ TEXT countbytebody<>(SB),NOSPLIT,$0
PCALIGN $16
head_loop:
MOVBU.P 1(R0), R5
CMP R5, R1
CMP R2.UXTB, R5
CINC EQ, R11, R11
SUB $1, R2, R2
SUB $1, R1, R1
CMP R0, R3
BNE head_loop
// Work with 32-byte aligned chunks
chunk:
BIC $0x1f, R2, R9
BIC $0x1f, R1, R9
// The first chunk can also be the last
CBZ R9, tail
// R3 = end of 32-byte chunks
ADD R0, R9, R3
MOVD $1, R5
VMOV R5, V5.B16
// R2 = length of tail
SUB R9, R2, R2
// Duplicate R1 (byte to search) to 16 1-byte elements of V0
VMOV R1, V0.B16
// R1 = length of tail
SUB R9, R1, R1
// Duplicate R2 (byte to search) to 16 1-byte elements of V0
VMOV R2, V0.B16
// Clear the low 64-bit element of V7 and V8
VEOR V7.B8, V7.B8, V7.B8
VEOR V8.B8, V8.B8, V8.B8
@ -79,14 +90,5 @@ chunk_loop:
BNE chunk_loop
VMOV V8.D[0], R6
ADD R6, R11, R11
CBZ R2, done
tail:
// Work with tail shorter than 32 bytes
MOVBU.P 1(R0), R5
SUB $1, R2, R2
CMP R5, R1
CINC EQ, R11, R11
CBNZ R2, tail
done:
MOVD R11, (R8)
RET
CBZ R1, done
B tail